From 842ab9a1c4ed4bcdb5d5084c50ff8da4ca1bbf0b Mon Sep 17 00:00:00 2001
From: Ruoshi Zhang <rosyzhang9701@gmail.com>
Date: Tue, 12 May 2020 14:00:56 +0200
Subject: [PATCH] Squashed 'lib/mmseqs/' changes from 0a1348be7..46c843895

46c843895 Update combine pval agg-mode 3
67d610136 Disable fancy progress bars on travis to reduce output
203a21736 Updated two more tests to use tighter ROC thresholds
a9052f449 Update regression with tighter bounds for ROC tests
c62736a6d Correctly parse keys from data files in filterdb --filter-file This was causing a linsearch instability
fe007cb4e Use MultiParam for gapOpen, gapExtend costs
3513001d3 Add easy-rbh workflow
d0d3032e9 Fix RBH search if using -a to show alignments
ce1a43bf1 Merge branch 'master' of https://github.com/soedinglab/mmseqs2
ea24e4934 Fix issues with abs. path if using aria2c
5228745f5 Improve --alignment-mode parameter description and make it a non expert parameter
fffa9b10e Fix various inconsistencies and usability issues with alignall: * alignall alignment-mode did not correspond to align alignment-mode * add-backtrace did not do anything, has to be specified now if backtrace is needed * Did return an alignment db type even though it is incompatible with that type, uses generic for now * various parameters were passed but unused   - zdrop and scorebias are used now (however see below)   - realign, alt ali, max accept/reject, wrapped are now gone
290668474 Fix wrong warning
813d81f29 Update regression
264d78117 Switch greedy clustering algorithm back to old idea
c09f6574e Improve nucleotide clustering workflow
38a737708 Set k-mers in linclust to 0 for the nucleotide clustering
7df6e3f75 Replace characters that can not be reversed by N in extract frames
e9678f625 Update regression
f886e868f Add nucleotide support to cluster (workflow nucleotide_clustering), clust module will infer identity automatically if missing, Improve low. mem. greedy incremental algorithm, Update regression
5f8735872 Add kmers-per-sequence-scale to linsearch
0310eb607 Change --kmer-per-seq-scale to a multi parameter, add error if cluster is called with a nucleotide sequence
e258bc8d8 Fix #299 PDB70 database creation was not working
7095f37e4 Add support for reverse complement in rescorediagonal --rescore-mode 0 and 1
61ca48883 Fix result2dnamsa
70d014e41 Add search-type 4 to Search
462f24cbb Add module result2dnamsa
5670d990e Fix regression error
e4451d591 Add result direction parameter to kmersearch
12c499dcd Fix reverse sequences issues in linclust and linsearch
44499c3ce Update filterdb regression test
807b4a56a Fix issue https://github.com/soedinglab/MMseqs2/issues/290. Filterdb checked for mode == true but mode was 2.
24479bc27 Fix Docker
a578f52a7 Fix char signedness on PPC
a0d64a989 Update regression
a07a266f9 Working on PPC64LE support
09734177c Remove remaining _mm_shuffle_epi32
cdef78a69 Merge pull request #285 from hgsommer/misc_small
283c8d03f Replace goto end in ssw
6bfc50281 Fix c/p mistake in convertalignments
e61da3447 Fix spelling of 'length'
9a63760fa Replace nested ternary operator
4349b5c6e Avoid repeatedly checking for profile db types
c170a11f5 Call MsaFilter::shuffleSequences() from MsaFilter::filter()
ef49ba220 Return value from MsaFilter::filter()
d155dc36c Replace int by bool literals for bool variable
ec6722adc Align headings with column in PSSMCalculator::printProfile()
548a9bd68 Avoid forward declaration of ScoreMatrix
d0fbe471f Do some cleanup in StripedSmithWaterman.cpp
91d1aeddc Replace check for zero-sized containers by empty()
e47b8eed9 Remove superfluous parameter from ssw_init()
250b1221d Simplify return statements
4fe1116ae Remove counting zero scores in Sequence::mapProfile()
4303728b5 Replace multiplication by zero
1bd602420 Remove increment by zero
e4d4389f2 Move check for exit condition in front of allocations
556d26d1a Clean up function signatures in MultipleAlignment
3863af9ac Move include back to header to restore build
e1208493a Remove unused TmpResult score field
1fd4db8f2 Die if DBReader cannot reopen files (e.g. no more file handles left)
1e21b87ba Purge sequenceLookup early since it is recreated in split databases
40854ddcd Prefiltering and CacheFriendlyOperations refactoring
2433e086b WASM work in progress
14014cd0e Fix prefilter overflow instability
e0f971848 Add conda forge to conda install instructions
aa175d636 Fix off by one in kmermatcher https://github.com/soedinglab/MMseqs2/issues/274#issuecomment-586298079
d1607bc8a Remove LINE_MAX
eca2155d7 Clear string buffer instead of reassigning in swapresults
0f4645edd Fix wrong reverse marking in linsearch reported by UBSAN
5b612a327 Missing mpi binaries for travis regression
83d22417a Next try for ARM compiler flags
7ad122f0a Missed a few variables
ac7914bea Do not require a cmake variable to build ARM
0dcfaadbb Update regression to fix broken samtools call on ARM
29927b4c4 More NEON fixes, we assume signed chars, ARM uses unsigned by default
7760220ff Next try to get the ARM regression to work
cc6d0d52b Add hack to not break travis log size limit
5408c3d10 Try to get NEON to compile
83192cabd Fix search workflow parameters printed twice
f6f001c8c Fix new clang-10 warnings and further travis fixes
259e64341 llvm-10 alias is not whitelisted in travis yet
b1249fd54 Fix errors in Travis YAML from previous commit
18486d4c5 Update travis - use native aarch64 for neon - use xenial - shorten script
98c37f3c3 shortened MultiParam usage, improved line breaks in usage
c9be07f1a Add gcc-9 to travis
2e5fb309a Fix travis clang build
d5865c894 Remove MultiParam g++-9 warning
73679835b Rework target split merging
ca5869397 Fix RESSIZE issue in slice search if sequences are used
491900b99 Improve usage text of cluster/linclust
0166850a2 Remove old greedy incremental clustering code and just run the memory efficient version instead.
15163e64c Fix Verbosity in workflows
aa78af463 Fix issue https://github.com/soedinglab/MMseqs2/issues/274
7846dfce3 fixed clang template error
e1206371c extended MultiParam class, replaced ScoreMatrixFile type by MultiParam<char*>
b88b54756 rewrite alphabetSize as multi parameter
ecb4e35d4 started template class MultiParam to store sequence type specific values
e1a1c1226 changed dbtype comparison in AlignmentSymmetry
2a829aef7 Replace symlinkat call with getcwd/chdir/symlink/chdir to fix Conda build using macOS 10.9 SDK
28e83e8d5 Add OpenMP include to DBReader
fb00aa0c3 Fix realloc issue while IndexTable creation of profiles
504e5021f Take max. seq. len of query and target db in prefilter and alignment
16e235214 Fix bug if seq. len > max seq. length in Alignment
80d0187de Fix asan issue
751f5c19f Make ZDROP an expert parameter, change description text
1b6edd0d4 Rework x detection (SIMD)
9677254ab Merge branch 'master' of https://github.com/soedinglab/mmseqs2
1ac1e6866 Fix max seq issues in prefilter
cb737033c Reset download strategy to not use aria2c for the NCBI download
c95f3ee0e fixed ksw2 test
72b95c0ce Error if we cannot download from NCBI
1d0aad50b Fix databases not piecing together all kalamari accessions
516723d53 Merge branch 'master' of https://github.com/soedinglab/MMseqs2
d81b6cca5 added zdrop parameter to control banded nucleotide alignment
e2e39a971 Add Kalamari Contaminants database
c0c538ea3 Various fixes in databases script
08cc95b3a Fix createtaxdb redownloading when taxdump already exists
018eb3498 Remove a bit whitespace in front of each parameter in usage message
8aa7513de add aggregatetax example, fix typos
8bcd7c740 Fix typo
8e581b762 Rework usage texts
7dc25764a Hide most parameters from createindex
2baa609e8 Add examples to many modules
00a7d7696 fixed bugs for long or wrapped nucleotide sequences
a4bdcb478 eggNOG profiles should not depend on the deleted MSAs
4c7830954 Fix eggNOG database construction
f7a5599c8 Cleanup not needed files immediately in databases workflow
3ed3690d4 Fix downloads always restarting in databases workflow
4cfac9a8a Fix aria warning with more than 16 connections
e0a00e10d Revert "Use SW instead of BandedNucAln if we don't have diagonals"
7ac966b2e Fix result2msa could fail if it was writing compressed output
95729ac7c Fix wrong output DB type written in alignall
f899e7c7a Use SW instead of BandedNucAln if we don't have diagonals
c08d9fa8e Allow parameter descriptions to span multiple lines
57868498e MMseqs2 is not limited to proteins, update README to reflect that
11818b0a2 Cleanup hiding parameters in workflows
c481cea60 Remove some useless includes
2f64aeeb8 Fix databases timestamp appending instead of overwriting
ae9e9e329 Add eggNOG setup procedure to databases
31c8e5d50 Shorten two short parameter descriptions
2f49d3e3e Read header from lookup in msa2profile if available
1356869b0 add option to reverse profile dbs
ac3482e80 More issues with zlib and tar2db
aaafafe43 Fix tar2db keys
c751d9e2f More tar2db fixes
a9c93014c Fix variadic input to tar2db
51a761305 Add tar2db module to convert content of any tar to a DB
96f9a91e5 Use nedmalloc on Windows/Cygwin
73f5c2a2d Add databases workflow to README
5a7ac9e54 make align output consistent
c5ebe5297 fixed setcover cluster mode (by fixing bug in similarity reading for short aln results e.g. hamming distance aln)
481696b5f Fix databases output
c6b4a57a8 Beginning cleaning up parameter descriptions
a9552a177 Show default value of bool parameters
af89c4677 Add a proposed example text structure
9c17f4eba Rework module description texts, better categories, shorten all descriptions, prepare to replace long descriptions with examples
00ff199e8 Add Resfinder DB
f1011ecb4 Fix krona again marked as vendored
02001ab03 missing mode resulted in different top1
4375463bc Header db should not have to be an unsplit db
edccbf33f Actually fix extractorfs lookup creation
041e8e558 Improve README
a8f2c7bad Remove correct workflow script in createtaxdb.sh
26c8202a9 print createdb cmd line again
df02bae34 Refactor createseqfiledb, remove stringstream
2523ebe1a do not write null byte
af847a724 Fix clang warning from DBConcat
ef1ec596f extend dbconcat to handle auxiliary files
528bd2134 not needed
dec1b9215 Silence warning in GCC 4.8 casting function to void*
2d44c886d Fix extractorfs not being able to create lookup
ffe66afac Replace isnumber with isdigit. Add more tests to TestTaxExpr
fbe09867e Rework Taxon Expr parsing
f58329ef5 Add constructor to define custom functions to ExpressionParser
b6ef07281 Initialize expressionparser per thread, was not thread safe
f966bfa62 Fix reallocation issue in BandedAlignment
bbd3c2bb7 Add +1 to realloc in BandedNucleotideAligner but not to length
6b6e82ae6 Add +1 to realloc in mapSequence
75e2c8ec4 Fix off by one issues in realloc in rescorediagonal and BandedNucleotideAligner
afd14c8c2 First step to get rid of maxSeqLen
13ca612db Fix allocation issue in kmermatcher if sequences are longer than 2^16
62de5ba93 Fix off by one in computation for splits in kmermatcher
35e95d180 Change int_sequence to char (big change)
ecf82f2f4 Revert "Temporarily disable soft split mode for createdb in easy workflows"
d19219dd4 Merge branch 'master' of https://github.com/soedinglab/mmseqs2
1a0d898ec Fix softlink issue in createdb https://github.com/soedinglab/MMseqs2/issues/265
13e0fe466 Temporarily disable soft split mode for createdb in easy workflows
4487b6e14 Fix view module to work with softlinked createdb dbs
c1e9eb0e3 Fix MPI issue if only one server is used
e781c3fe5 fix MPI compile error
9bcff2844 Fix Filter2 bug of HH-suite in MMseqs2 https://github.com/soedinglab/hh-suite/pull/182
01db79d33 Fix some bugs in splitting handling
d9a887453 Fix memory splitting issues in kmermatcher, kmerindexdb
37880f083 Fix MPI in kmermatcher and indexdb
bee93123f Update regression
03a89ff1c Merge branch 'master' of https://github.com/soedinglab/mmseqs2
6ca967362 Update the way how k-mers are extracted in kmermatcher. Extraction should be now ~3 times faster.
f1388309d Introducing databases workflow to automatically setup and download common databases
d78fdbb06 Add progress to convertmsa
18acba224 Do not recreate _mapping file if it already exists in createtaxdb
63a373f5a Skip validations steps correctly if a input db is neither INPUT nor OUTPUT
d95caa1a7 Allow modules with zero parameters
9f8aff948 Allow modules to handle -h or --help themselves
cf5691f92 Typo
8ebc9d16b fixed access mode
31895414d Clarify parameter help in createdb
f644744a8 Merge branch 'master' of https://github.com/soedinglab/mmseqs2
c287719d9 Remove check for profiles for splice search. It should also work with sequence databases.
c75fe9acf regression submodule w filtertaxseqdb
7587a872f Add one more missing check in kmermatcher
8d4e9f4fc Remove +1 from size in initKmerPositionMemory
aca141e95 Fix shellcheck error in splicesearch
8bdff50e1 Move +1 from initKmerPositionMemory outside
f12821e35 Merge branch 'master' of https://github.com/soedinglab/mmseqs2
d74b76ca5 Avoid overflow in kmermatcher if split is needed
fd90ff2c3 Move compiled data resources into subfolders
2fd9f25d2 Merge branch 'master' of https://github.com/soedinglab/mmseqs2
b439ce831 Make the slice search applicable to other databases types, not just profiles
589a2e276 Fix apply crashing on empty entries
82542a6ac Merge branch 'master' of https://github.com/soedinglab/mmseqs2
c0acdd8f3 Fix memory leak in createsubdb.
5129a956d Validate taxonomic ranks and make input/output formats consistent
53bb55b38 Fix issues in hash function https://github.com/soedinglab/MMseqs2/issues/252
764c4a3e7 Fix lca message
c013a6929 Fix LCA output message
a1206690d Change db validator from result2stats
714f5b4fb Replace mmaped input file with std c io in createsubdb
6e43e9413 Add remove .source file to rmdb
3e58bb85b Fix result2flat https://github.com/soedinglab/MMseqs2/issues/261
3e27833db Revert easycluster.sh back to result2flat. Reason is that createsubdb can not handle soft linked sequence databases (input.0 -> input.fas)
33354680f Merge branch 'master' of https://github.com/soedinglab/mmseqs2
1e92fb504 Replace result2repseq and result2flat with createsubdb and convert2fasta
55bcdd303 single step clustering could potentially cluster unrelated sequences due to hash collisions
fdd0646b1 Fix clusthash issues with parallelization and nucl input
e62a1c717 Merge branch 'master' of https://github.com/soedinglab/mmseqs2
1336b7ad2 Add MSA to allDb and allDbAndFlat
48a037a2e Update Prefiltering.cpp
a1adbf52d Fix warning: Remove useless copy constructor from Matcher::result_t
d3ca42657 Remove truncatedCounter variable in QueryMatcher
4647525ec Show full help text if "Error in argument " occurs
4149ae457 Remove annoying message in prefilter (truncated result). Move it to the statistics section.
d5aab5b86 Update regression
1f1e049e6 Fix output of unclassified hits in convertalis
83ff5c601 Fix permission issues for tmp directory
cce6e6714 add support to output taxon in easy-search when using an indexed database
f200bdd62 Merge branch 'master' of https://github.com/soedinglab/mmseqs2
6f28a29ae Fix seg. fault if all sequences could be classified
473d60580 Update batches
b52668f6e Add chat icon
af54c8e8e Update README.md
7eb6a0b70 Made addtaxonomy more resilient against invalid taxonomy mappings
3482b0e91 Merge pull request #260 from RuoshiZhang/master
36f49f5b5 Fix issue in memory computation for split
bcb97d63f Update README.md
abcd97de7 write same number of fields even if no hit
38e102181 Update regression to hopefully fix windows failure
f41511465 Fix spelling error
1fd24924e Add a search-type 4 for trans-trans search returning a nucl backtrace in offsetalignment
31f6d7ac3 add aggregatetax to assign set tax by majority vote
b6e8ee239 allow more dbtypes in swapdb
c9d02ef21 add option to view rank index
49db7258e typo fix
9c32930f3 Merge branch 'master' of github.com:soedinglab/MMseqs2
17b5494fe Fix auto detection of dbtype in createdb
8831df81d Merge branch 'master' of github.com:soedinglab/MMseqs2
be1a9822c Fix createseqfiledb https://github.com/soedinglab/MMseqs2/issues/258
02be0c4ea Fix summarizeresult to support reverse position in alignment
7ef586276 added filtertaxseqdb
00f2fd2b8 added mode for all but index
127db8c6d minor tidying for filtertaxdb
8144e7653 Merge branch 'master' of github.com:soedinglab/MMseqs2
48f77fa7d Fix ASan issue in filterdb
d722d5724 Fix warning in filterdb
4a4e6ea15 Update regression test for filterdb
31a7dc124 filterdb --join-db ignores lines it cannot join instead of crash
6c6faa96d filterdb's --extract-lines works together with --trim-to-one-column
12bee8142 filterdb can filter by rows with value within percentage #249
5c919ab95 Allow double parameters separately from floats in parsing
f9be8a88d Remove broken filterdb paths
1dc04f5e1 Refactoring of filterdb
90e3a9aaf Fix bug for enforced dbtypes in createdb
a4cee78db New regression to check stdin support
17ec97c78 Add stdin support to easy workflows
76c9e7c36 Fix compiler warnings in KSeqWrapper
0cc45536b Overwrite dbtype correctly in createdb
c0045182b Add stdin to createdb
02a88e438 use https instead of ftp for downloading taxdb data
a33bd27f4 offsetalignments now correctly returns a nucleotide backtrace if needed
456e1b5ab include VTML40 in binary for easier access
775de3850 Add missed target .source file for reading in convertalis
c08c071b2 Overload patterncompiler isMatch for pos of match
ba6aa8d12 avoid appending extra tabs besthitperset

git-subtree-dir: lib/mmseqs
git-subtree-split: 46c8438958edccd8fd09640eb174e2449529e4df
---
 .gitattributes                                |    2 +-
 .travis.yml                                   |  164 +-
 CMakeLists.txt                                |   28 +-
 Dockerfile                                    |   74 +-
 README.md                                     |   34 +-
 azure-pipelines.yml                           |    2 +-
 cmake/MMseqsResourceCompiler.cmake            |   18 +-
 cmake/MMseqsSetupDerivedTarget.cmake          |    2 +-
 data/CMakeLists.txt                           |   38 +-
 data/resources/CMakeLists.txt                 |   11 +
 .../CovSeqidQscPercMinDiag.lib                |    0
 .../CovSeqidQscPercMinDiagTargetCov.lib       |    0
 .../ExpOpt3_8_polished.cs32.lib               |    0
 data/{ => resources}/Library255_may17.lib     |    0
 data/{ => resources}/cs219.lib                |    0
 data/{ => resources}/krona_prelude.html       |    0
 data/{ => resources}/libPolished_8.lib        |    0
 data/{ => resources}/libPure_blosum62_255.lib |    0
 data/{ => resources}/libPure_blosum62_32.lib  |    0
 data/workflow/CMakeLists.txt                  |   28 +
 data/{ => workflow}/blastn.sh                 |   13 +-
 data/{ => workflow}/blastp.sh                 |    1 -
 data/{ => workflow}/blastpgp.sh               |   10 +-
 data/{ => workflow}/cascaded_clustering.sh    |   62 +-
 data/{ => workflow}/clustering.sh             |   19 +-
 data/{ => workflow}/createindex.sh            |   18 +-
 data/{ => workflow}/createtaxdb.sh            |   34 +-
 data/workflow/databases.sh                    |  292 +
 data/{ => workflow}/easycluster.sh            |   16 +-
 data/workflow/easyrbh.sh                      |   56 +
 data/{ => workflow}/easysearch.sh             |   19 +-
 data/{ => workflow}/easytaxonomy.sh           |   20 +-
 data/{ => workflow}/enrich.sh                 |    1 -
 data/{ => workflow}/linclust.sh               |   32 +-
 data/{ => workflow}/linsearch.sh              |    7 +-
 data/{ => workflow}/map.sh                    |    0
 data/{ => workflow}/multihitdb.sh             |    1 -
 data/{ => workflow}/multihitsearch.sh         |    7 +-
 data/workflow/nucleotide_clustering.sh        |  125 +
 data/{ => workflow}/rbh.sh                    |   18 +-
 .../searchslicedtargetprofile.sh              |   26 +-
 data/{ => workflow}/searchtargetprofile.sh    |   10 +-
 data/{ => workflow}/taxonomy.sh               |   31 +-
 data/{ => workflow}/translated_search.sh      |    7 +-
 data/{ => workflow}/update_clustering.sh      |   27 +-
 lib/alp/CMakeLists.txt                        |    2 +-
 lib/cacode/CMakeLists.txt                     |    2 +-
 lib/ksw2/kseq.h                               |    8 +-
 lib/ksw2/ksw2_extz2_sse.cpp                   |   14 +-
 lib/microtar/CMakeLists.txt                   |    1 +
 lib/microtar/LICENSE                          |   19 +
 lib/microtar/README.md                        |   99 +
 lib/microtar/microtar.c                       |  376 ++
 lib/microtar/microtar.h                       |   90 +
 lib/nedmalloc/CMakeLists.txt                  |    5 +
 lib/nedmalloc/License.txt                     |   23 +
 lib/nedmalloc/Readme.txt                      |  136 +
 lib/nedmalloc/malloc.c.h                      | 5761 +++++++++++++++++
 lib/nedmalloc/nedmalloc.c                     |  954 +++
 lib/nedmalloc/nedmalloc.h                     |  182 +
 lib/simd/simd.h                               |   49 +-
 lib/simd/sse2altivec.h                        |  127 +
 lib/simd/sse2neon.h                           | 4035 ++++++++----
 lib/simd/sse2wasm.h                           |  166 +
 lib/tinyexpr/CMakeLists.txt                   |    3 +
 lib/xxhash/xxh3.h                             | 1632 +++++
 lib/xxhash/xxhash.cpp                         |    5 +
 lib/xxhash/xxhash.h                           | 1671 +++++
 src/CMakeLists.txt                            |   28 +-
 src/CommandDeclarations.h                     |    6 +
 src/MMseqsBase.cpp                            | 1494 +++--
 src/alignment/Alignment.cpp                   |   52 +-
 src/alignment/Alignment.h                     |    5 +-
 src/alignment/BandedNucleotideAligner.cpp     |   65 +-
 src/alignment/BandedNucleotideAligner.h       |    6 +-
 src/alignment/Main.cpp                        |    1 +
 src/alignment/Matcher.cpp                     |   51 +-
 src/alignment/Matcher.h                       |   94 +-
 src/alignment/MsaFilter.cpp                   |   31 +-
 src/alignment/MsaFilter.h                     |   12 +-
 src/alignment/MultipleAlignment.cpp           |   31 +-
 src/alignment/MultipleAlignment.h             |   18 +-
 src/alignment/PSSMCalculator.cpp              |   36 +-
 src/alignment/PSSMCalculator.h                |    2 +-
 src/alignment/StripedSmithWaterman.cpp        |   87 +-
 src/alignment/StripedSmithWaterman.h          |   20 +-
 src/alignment/rescorediagonal.cpp             |   23 +-
 src/clustering/AlignmentSymmetry.cpp          |   48 +-
 src/clustering/Clustering.cpp                 |    2 +-
 src/clustering/Clustering.h                   |    2 -
 src/clustering/ClusteringAlgorithms.cpp       |  140 +-
 src/clustering/Main.cpp                       |   20 +-
 src/commons/A3MReader.cpp                     |    4 +-
 src/commons/Application.cpp                   |   49 +-
 src/commons/BaseMatrix.cpp                    |   22 +-
 src/commons/BaseMatrix.h                      |   10 +-
 src/commons/CMakeLists.txt                    |    4 +-
 src/commons/Command.cpp                       |   32 +-
 src/commons/Command.h                         |   43 +-
 src/commons/DBConcat.cpp                      |  234 +-
 src/commons/DBConcat.h                        |   17 +-
 src/commons/DBReader.cpp                      |   16 +-
 src/commons/DBReader.h                        |    1 +
 src/commons/DBWriter.cpp                      |    7 +-
 src/commons/DBWriter.h                        |   11 +-
 src/commons/ExpressionParser.cpp              |   11 +-
 src/commons/ExpressionParser.h                |    1 +
 src/commons/FileUtil.cpp                      |   36 +-
 src/commons/FileUtil.h                        |    4 +-
 src/commons/KSeqWrapper.cpp                   |   44 +-
 src/commons/KSeqWrapper.h                     |   11 +-
 src/commons/MultiParam.cpp                    |  112 +
 src/commons/MultiParam.h                      |   89 +
 src/commons/NucleotideMatrix.cpp              |   20 +-
 src/commons/Orf.cpp                           |   24 +-
 src/commons/Parameters.cpp                    |  815 ++-
 src/commons/Parameters.h                      |  110 +-
 src/commons/ScoreMatrixFile.cpp               |   64 -
 src/commons/ScoreMatrixFile.h                 |   35 -
 src/commons/Sequence.cpp                      |   82 +-
 src/commons/Sequence.h                        |   82 +-
 src/commons/SubstitutionMatrix.cpp            |   44 +-
 src/commons/SubstitutionMatrix.h              |    8 +-
 src/commons/SubstitutionMatrixProfileStates.h |   72 +-
 src/commons/Util.cpp                          |   54 +-
 src/commons/Util.h                            |    2 +
 src/commons/itoa.h                            |   35 +-
 src/linclust/KmerIndex.h                      |    7 +-
 src/linclust/LinsearchIndexReader.cpp         |    2 +-
 src/linclust/MarkovKmerScore.h                |    4 +-
 src/linclust/kmerindexdb.cpp                  |   48 +-
 src/linclust/kmermatcher.cpp                  |  666 +-
 src/linclust/kmermatcher.h                    |   65 +-
 src/linclust/kmersearch.cpp                   |  135 +-
 src/linclust/kmersearch.h                     |    7 +-
 src/multihit/MultiHitSearch.cpp               |   24 +-
 src/multihit/combinepvalperset.cpp            |   30 +-
 src/prefiltering/CacheFriendlyOperations.cpp  |  277 +-
 src/prefiltering/CacheFriendlyOperations.h    |   66 +-
 .../ExtendedSubstitutionMatrix.cpp            |   28 +-
 src/prefiltering/ExtendedSubstitutionMatrix.h |   13 +-
 src/prefiltering/IndexBuilder.cpp             |   44 +-
 src/prefiltering/IndexTable.h                 |   82 +-
 src/prefiltering/Indexer.cpp                  |    8 +-
 src/prefiltering/Indexer.h                    |   12 +-
 src/prefiltering/KmerGenerator.cpp            |    2 +-
 src/prefiltering/KmerGenerator.h              |    2 +-
 src/prefiltering/Main.cpp                     |    4 -
 src/prefiltering/Prefiltering.cpp             |  163 +-
 src/prefiltering/Prefiltering.h               |   10 +-
 src/prefiltering/PrefilteringIndexReader.cpp  |   17 +-
 src/prefiltering/PrefilteringIndexReader.h    |    2 +
 src/prefiltering/QueryMatcher.cpp             |  178 +-
 src/prefiltering/QueryMatcher.h               |  155 +-
 src/prefiltering/ReducedMatrix.cpp            |   56 +-
 src/prefiltering/ReducedMatrix.h              |   28 +-
 src/prefiltering/SequenceLookup.cpp           |    9 +-
 src/prefiltering/SequenceLookup.h             |    2 +-
 src/prefiltering/UngappedAlignment.cpp        |   10 +-
 src/prefiltering/ungappedprefilter.cpp        |    6 +-
 src/taxonomy/CMakeLists.txt                   |    2 +
 src/taxonomy/NcbiTaxonomy.cpp                 |   80 +-
 src/taxonomy/NcbiTaxonomy.h                   |   48 +-
 src/taxonomy/TaxonomyExpression.h             |  171 +-
 src/taxonomy/addtaxonomy.cpp                  |   88 +-
 src/taxonomy/aggregatetax.cpp                 |  172 +
 src/taxonomy/createtaxdb.cpp                  |    4 +-
 src/taxonomy/filtertaxdb.cpp                  |    9 +-
 src/taxonomy/filtertaxseqdb.cpp               |  112 +
 src/taxonomy/lca.cpp                          |   25 +-
 src/taxonomy/taxonomyreport.cpp               |    3 +-
 src/test/TestAlignment.cpp                    |   22 +-
 src/test/TestAlignmentPerformance.cpp         |    6 +-
 src/test/TestAlignmentTraceback.cpp           |   12 +-
 src/test/TestBestAlphabet.cpp                 |    4 +-
 src/test/TestCompositionBias.cpp              |    6 +-
 src/test/TestDiagonalScoring.cpp              |   32 +-
 src/test/TestDiagonalScoringPerformance.cpp   |    6 +-
 src/test/TestExtendedSubstitutionMatrix.cpp   |   12 +-
 src/test/TestIndexTable.cpp                   |    2 +-
 src/test/TestKmerGenerator.cpp                |    6 +-
 src/test/TestKmerGeneratorPerf.cpp            |    6 +-
 src/test/TestKmerGeneratorProfile.cpp         |    4 +-
 src/test/TestKmerNucl.cpp                     |    6 +-
 src/test/TestKmerScore.cpp                    |    4 +-
 src/test/TestKsw2.cpp                         |   12 +-
 src/test/TestMultipleAlignment.cpp            |   13 +-
 src/test/TestPSSM.cpp                         |   15 +-
 src/test/TestPSSMPrune.cpp                    |   17 +-
 src/test/TestProfileAlignment.cpp             |   18 +-
 src/test/TestProfileStates.cpp                |   16 +-
 src/test/TestReduceMatrix.cpp                 |   34 +-
 src/test/TestSequenceIndex.cpp                |    8 +-
 src/test/TestTanTan.cpp                       |   10 +-
 src/test/TestTaxExpr.cpp                      |   72 +-
 src/util/CMakeLists.txt                       |    2 +
 src/util/alignall.cpp                         |   35 +-
 src/util/alignbykmer.cpp                      |   60 +-
 src/util/apply.cpp                            |   10 +-
 src/util/clusthash.cpp                        |  223 +-
 src/util/convertalignments.cpp                |   43 +-
 src/util/convertmsa.cpp                       |    3 +
 src/util/convertprofiledb.cpp                 |    4 +-
 src/util/countkmer.cpp                        |   11 +-
 src/util/createdb.cpp                         |  123 +-
 src/util/createseqfiledb.cpp                  |  126 +-
 src/util/createsubdb.cpp                      |   30 +-
 src/util/diffseqdbs.cpp                       |    2 +-
 src/util/expandaln.cpp                        |   22 +-
 src/util/extractdomains.cpp                   |    6 +-
 src/util/extractframes.cpp                    |   24 +-
 src/util/extractorfs.cpp                      |    8 +-
 src/util/filterdb.cpp                         |  776 ++-
 src/util/filterdb.h                           |  121 -
 src/util/indexdb.cpp                          |    2 +-
 src/util/maskbygff.cpp                        |    2 +-
 src/util/masksequence.cpp                     |    6 +-
 src/util/mergeclusters.cpp                    |    2 +-
 src/util/msa2profile.cpp                      |   42 +-
 src/util/offsetalignment.cpp                  |   11 +-
 src/util/orftocontig.cpp                      |    2 +-
 src/util/profile2cs.cpp                       |    6 +-
 src/util/profile2pssm.cpp                     |    4 +-
 src/util/profile2seq.cpp                      |    4 +-
 src/util/result2dnamsa.cpp                    |  154 +
 src/util/result2flat.cpp                      |   28 +-
 src/util/result2msa.cpp                       |   30 +-
 src/util/result2pp.cpp                        |    5 +-
 src/util/result2profile.cpp                   |   20 +-
 src/util/reverseseq.cpp                       |   17 +-
 src/util/sortresult.cpp                       |    2 -
 src/util/splitsequence.cpp                    |    7 +-
 src/util/summarizealis.cpp                    |    2 +-
 src/util/summarizeresult.cpp                  |   20 +-
 src/util/summarizetabs.cpp                    |    6 +-
 src/util/swapresults.cpp                      |   28 +-
 src/util/tar2db.cpp                           |  244 +
 src/util/transitivealign.cpp                  |    6 +-
 src/util/translateaa.cpp                      |    4 +-
 src/util/translatenucs.cpp                    |    2 +-
 src/util/view.cpp                             |   29 +-
 src/version/CMakeLists.txt                    |    1 +
 src/workflow/CMakeLists.txt                   |    2 +
 src/workflow/Cluster.cpp                      |  124 +-
 src/workflow/ClusterUpdate.cpp                |   28 +-
 src/workflow/CreateIndex.cpp                  |   40 +-
 src/workflow/Databases.cpp                    |  256 +
 src/workflow/EasyCluster.cpp                  |   27 +-
 src/workflow/EasyLinclust.cpp                 |   24 +-
 src/workflow/EasyRbh.cpp                      |  106 +
 src/workflow/EasySearch.cpp                   |   27 +-
 src/workflow/EasyTaxonomy.cpp                 |   32 +
 src/workflow/Linclust.cpp                     |   19 +-
 src/workflow/Linsearch.cpp                    |   31 +-
 src/workflow/Map.cpp                          |   18 +-
 src/workflow/Rbh.cpp                          |   20 +-
 src/workflow/Search.cpp                       |   49 +-
 src/workflow/Taxonomy.cpp                     |   27 +-
 util/regression                               |    2 +-
 259 files changed, 21471 insertions(+), 5987 deletions(-)
 create mode 100644 data/resources/CMakeLists.txt
 rename data/{ => resources}/CovSeqidQscPercMinDiag.lib (100%)
 rename data/{ => resources}/CovSeqidQscPercMinDiagTargetCov.lib (100%)
 rename data/{ => resources}/ExpOpt3_8_polished.cs32.lib (100%)
 rename data/{ => resources}/Library255_may17.lib (100%)
 rename data/{ => resources}/cs219.lib (100%)
 rename data/{ => resources}/krona_prelude.html (100%)
 rename data/{ => resources}/libPolished_8.lib (100%)
 rename data/{ => resources}/libPure_blosum62_255.lib (100%)
 rename data/{ => resources}/libPure_blosum62_32.lib (100%)
 create mode 100644 data/workflow/CMakeLists.txt
 rename data/{ => workflow}/blastn.sh (87%)
 rename data/{ => workflow}/blastp.sh (99%)
 rename data/{ => workflow}/blastpgp.sh (92%)
 rename data/{ => workflow}/cascaded_clustering.sh (84%)
 rename data/{ => workflow}/clustering.sh (81%)
 rename data/{ => workflow}/createindex.sh (68%)
 rename data/{ => workflow}/createtaxdb.sh (54%)
 create mode 100644 data/workflow/databases.sh
 rename data/{ => workflow}/easycluster.sh (81%)
 create mode 100755 data/workflow/easyrbh.sh
 rename data/{ => workflow}/easysearch.sh (76%)
 rename data/{ => workflow}/easytaxonomy.sh (82%)
 rename data/{ => workflow}/enrich.sh (99%)
 rename data/{ => workflow}/linclust.sh (77%)
 rename data/{ => workflow}/linsearch.sh (94%)
 rename data/{ => workflow}/map.sh (100%)
 rename data/{ => workflow}/multihitdb.sh (99%)
 rename data/{ => workflow}/multihitsearch.sh (89%)
 create mode 100755 data/workflow/nucleotide_clustering.sh
 rename data/{ => workflow}/rbh.sh (84%)
 rename data/{ => workflow}/searchslicedtargetprofile.sh (91%)
 rename data/{ => workflow}/searchtargetprofile.sh (85%)
 rename data/{ => workflow}/taxonomy.sh (81%)
 rename data/{ => workflow}/translated_search.sh (91%)
 rename data/{ => workflow}/update_clustering.sh (93%)
 create mode 100644 lib/microtar/CMakeLists.txt
 create mode 100755 lib/microtar/LICENSE
 create mode 100755 lib/microtar/README.md
 create mode 100755 lib/microtar/microtar.c
 create mode 100755 lib/microtar/microtar.h
 create mode 100644 lib/nedmalloc/CMakeLists.txt
 create mode 100644 lib/nedmalloc/License.txt
 create mode 100644 lib/nedmalloc/Readme.txt
 create mode 100644 lib/nedmalloc/malloc.c.h
 create mode 100644 lib/nedmalloc/nedmalloc.c
 create mode 100644 lib/nedmalloc/nedmalloc.h
 create mode 100644 lib/simd/sse2altivec.h
 create mode 100644 lib/simd/sse2wasm.h
 create mode 100644 lib/xxhash/xxh3.h
 create mode 100644 lib/xxhash/xxhash.cpp
 create mode 100644 lib/xxhash/xxhash.h
 create mode 100644 src/commons/MultiParam.cpp
 create mode 100644 src/commons/MultiParam.h
 delete mode 100644 src/commons/ScoreMatrixFile.cpp
 delete mode 100644 src/commons/ScoreMatrixFile.h
 create mode 100644 src/taxonomy/aggregatetax.cpp
 create mode 100644 src/taxonomy/filtertaxseqdb.cpp
 delete mode 100644 src/util/filterdb.h
 create mode 100644 src/util/result2dnamsa.cpp
 create mode 100644 src/util/tar2db.cpp
 create mode 100644 src/workflow/Databases.cpp
 create mode 100644 src/workflow/EasyRbh.cpp

diff --git a/.gitattributes b/.gitattributes
index 76d0629..5b20177 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1,3 @@
-data/krona_prelude.html linguist-vendored
+data/resources/krona_prelude.html linguist-vendored
 lib/* linguist-vendored
 lib/simd linguist-vendored=false
diff --git a/.travis.yml b/.travis.yml
index 1bd82a3..e1509fe 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,102 +1,98 @@
-language: minimal
+language: shell
+os: linux
+dist: xenial
+addons:
+  apt:
+    packages: &default_packages
+    - cmake
+    - make
+    - zlib1g-dev
+    - libbz2-dev
+    - vim-common
+    - shellcheck
 
-env:
-
-matrix:
+jobs:
   include:
-  - os: linux
-    dist: xenial
-    addons:
+  - addons:
+      apt:
+        packages:
+        - *default_packages
+        - build-essential
+    arch: arm64
+  - addons:
       apt:
         packages:
-          - qemu-user-static
-          - binfmt-support
-    env: QEMU_ARM=1
-  - os: linux
-    dist: trusty
-    addons:
+          - *default_packages
+          - build-essential
+    arch: ppc64le
+  - addons:
       apt:
         packages:
-        - cmake
-        - ninja-build
-        - clang-3.6
+        - *default_packages
+        - clang-3.8
         - libc++-dev
-        - zlib1g-dev
-        - libbz2-dev
-        - vim-common
-        - shellcheck
-    env: CC=clang-3.6 CXX=clang++-3.6
-  - os: linux
-    dist: trusty
-    addons:
+        - libomp-dev
+    env: CC=clang-3.8 CXX=clang++-3.8
+  - addons:
       apt:
         sources:
         - ubuntu-toolchain-r-test
-        - llvm-toolchain-trusty-7
+        - sourceline: 'deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-10 main'
+          key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
         packages:
-        - cmake
-        - ninja-build
-        - clang-7
-        - libc++-7-dev
-        - libc++abi-7-dev
-        - zlib1g-dev
-        - libbz2-dev
-        - vim-common
-        - shellcheck
-    env: CC=clang-7 CXX=clang++-7
-  - os: linux
-    dist: trusty
-    addons:
+        - *default_packages
+        - clang-10
+        - libc++-10-dev
+        - libc++abi-10-dev
+        - libomp-10-dev
+    env: CC=clang-10 CXX=clang++-10
+  - addons:
       apt:
+        sources:
+        - ubuntu-toolchain-r-test
+        - sourceline: 'deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-10 main'
+          key_url: 'https://apt.llvm.org/llvm-snapshot.gpg.key'
         packages:
-        - cmake
-        - ninja-build
+        - *default_packages
+        - clang-10
+        - libc++-10-dev
+        - libc++abi-10-dev
+        - libomp-10-dev
+        - mpi-default-dev
+        - mpi-default-bin
+    env: MPI=1 CC=clang-10 CXX=clang++-10
+  - addons:
+      apt:
+        sources:
+        - ubuntu-toolchain-r-test
+        packages:
+        - *default_packages
         - gcc-4.8
         - g++-4.8
-        - zlib1g-dev
-        - libbz2-dev
-        - vim-common
-        - shellcheck
     env: CC=gcc-4.8 CXX=g++-4.8
-  - os: linux
-    dist: trusty
-    addons:
+  - addons:
       apt:
         sources:
         - ubuntu-toolchain-r-test
         packages:
-        - cmake
-        - ninja-build
-        - gcc-8
-        - g++-8
-        - zlib1g-dev
-        - libbz2-dev
-        - vim-common
-        - shellcheck
-    env: CC=gcc-8 CXX=g++-8
-  - os: linux
-    dist: trusty
-    addons:
+        - *default_packages
+        - gcc-9
+        - g++-9
+    env: CC=gcc-9 CXX=g++-9
+  - addons:
       apt:
         sources:
         - ubuntu-toolchain-r-test
         packages:
-        - cmake
-        - ninja-build
-        - gcc-8
-        - g++-8
-        - zlib1g-dev
-        - libbz2-dev
-        - vim-common
-        - libopenmpi-dev
-        - shellcheck
-    env: MPI=1 CC=gcc-8 CXX=g++-8
-  allow_failures:
-  - env: QEMU_ARM=1
+        - *default_packages
+        - gcc-9
+        - g++-9
+        - mpi-default-dev
+        - mpi-default-bin
+    env: MPI=1 CC=gcc-9 CXX=g++-9
   fast_finish: true
-
-services:
-  - docker
+  allow_failures:
+  - arch: ppc64le
 
 before_install:
   - export CC
@@ -104,14 +100,12 @@ before_install:
 
 script:
   - |
-    if [[ -n "$QEMU_ARM" ]]; then \
-      docker build --build-arg NAMESPACE=arm64v8/ -t mmseqs . || exit 1; \
-    elif [[ "$TRAVIS_OS_NAME" == "linux" ]]; then \
-      if [[ -n "$MPI" ]]; then MPI=1; else MPI=0; fi; \
-      mkdir build; cd build; \
-      cmake -G Ninja -DENABLE_WERROR=1 -DHAVE_MPI="$MPI" -DHAVE_SSE4_1=1 -DHAVE_TESTS=1 -DREQUIRE_OPENMP=0 .. \
-        || exit 1; ninja || exit 1; \
-    else \
-      exit 1; \
-    fi
-
+    mkdir build; cd build; \
+    cmake -DHAVE_MPI="$([[ -z "$MPI" ]]; echo $?)" -DENABLE_WERROR=1 -DHAVE_TESTS=1 ..; \
+    make -j $(nproc --all); \
+    mkdir path; \
+    printf '#!/bin/sh\n/usr/bin/tee "$@" | tail\n' > path/tee; \
+    chmod +x path/tee; \
+    export PATH="$(readlink -f path):$PATH"; \
+    export TTY=0; \
+    ../util/regression/run_regression.sh ./src/mmseqs SCRATCH;
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 45bc04e..d381659 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,14 +38,25 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
     set(CMAKE_COMPILER_IS_ICC 1)
 endif ()
 
-# set flags
+# see https://wiki.debian.org/ArchitectureSpecificsMemo for char signedness
 set(MMSEQS_CXX_FLAGS "-std=c++0x")
-if (NOT ${HAVE_NEON})
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
+    set(ARM 1)
+    set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -DNEON=1 -DSSE=1 -fsigned-char -mfpu=neon")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
+    set(ARM 1)
+    set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -DNEON=1 -DSSE=1 -fsigned-char")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "PPC64*|ppc64*|powerpc64*")
+    set(PPC64 1)
+    set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -DSSE=1 -fsigned-char -std=gnu++0x -mcpu=power8 -mvsx")
+elseif (EMSCRIPTEN)
+    set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -DSSE=1 -DWASM=1 -msimd128 -s WASM=1 -s ASSERTIONS=1")
+else ()
     set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -m64")
 endif ()
 
 # Compiler-specific features
-if (CMAKE_COMPILER_IS_CLANG)
+if (CMAKE_COMPILER_IS_CLANG AND (NOT EMSCRIPTEN))
     set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LANGUAGE_STANDARD "c++11")
     set(CMAKE_XCODE_ATTRIBUTE_CLANG_CXX_LIBRARY "libc++")
     set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} -stdlib=libc++")
@@ -83,12 +94,23 @@ add_subdirectory(lib/zstd/build/cmake/lib EXCLUDE_FROM_ALL)
 include_directories(lib/tinyexpr)
 add_subdirectory(lib/tinyexpr EXCLUDE_FROM_ALL)
 
+# microtar
+include_directories(lib/microtar)
+add_subdirectory(lib/microtar)
+
+# default cygwin allocator (dlmalloc) locks on every allocation and destroys MT performance
+if (CYGWIN)
+    add_subdirectory(lib/nedmalloc)
+endif()
+
 include_directories(lib)
 include_directories(lib/simd)
 include_directories(lib/gzstream)
 include_directories(lib/alp)
 include_directories(lib/cacode)
 include_directories(lib/ksw2)
+include_directories(lib/xxhash)
+
 add_subdirectory(lib/cacode)
 add_subdirectory(lib/alp)
 add_subdirectory(lib/ksw2)
diff --git a/Dockerfile b/Dockerfile
index 8477fcd..3b9163b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,17 +1,22 @@
 ARG NAMESPACE=
 FROM debian:stable-slim as qemu-downloader
 ARG NAMESPACE
-RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
-RUN if [ X"$NAMESPACE" = X"arm64v8/" ]; then \
-      wget -nv -O "/usr/bin/qemu-aarch64-static" https://github.com/multiarch/qemu-user-static/releases/download/v3.1.0-2/qemu-aarch64-static; \
-    else \
-      echo -e '#!/bin/sh\n"$@"\n' > "/usr/bin/qemu-aarch64-static"; \
+RUN if [ X"$NAMESPACE" != X"" ]; then \
+        apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*; \
     fi; \
-    chmod +x /usr/bin/qemu-aarch64-static;
+    if [ X"$NAMESPACE" = X"ppc64le/" ]; then \
+        wget -nv -O /usr/bin/qemu-ppc64le-static https://github.com/multiarch/qemu-user-static/releases/download/v4.2.0-4/qemu-ppc64le-static; \
+        chmod +x /usr/bin/qemu-ppc64le-static; \
+    fi; \
+    if [ X"$NAMESPACE" = X"aarch64/" ]; then \
+        wget -nv -O /usr/bin/qemu-aarch64-static https://github.com/multiarch/qemu-user-static/releases/download/v4.2.0-4/qemu-aarch64-static; \
+        chmod +x /usr/bin/qemu-aarch64-static; \
+    fi; \
+    touch /usr/bin/dummy_copy
 
-FROM ${NAMESPACE}debian:stable-slim as mmseqs-builder
+FROM ${NAMESPACE}debian:stable-slim as builder
 ARG NAMESPACE
-COPY --from=qemu-downloader /usr/bin/qemu-aarch64-static /usr/bin/qemu-aarch64-static
+COPY --from=qemu-downloader /usr/bin/dummy_copy /usr/bin/qemu-aarch64-static* /usr/bin/qemu-ppc64le-static* /usr/bin/
 
 RUN apt-get update && apt-get install -y \
     build-essential cmake xxd git zlib1g-dev libbz2-dev \
@@ -19,45 +24,38 @@ RUN apt-get update && apt-get install -y \
 
 WORKDIR /opt/mmseqs
 ADD . .
-RUN mkdir -p build_sse/bin && mkdir -p build_avx/bin && mkdir -p build_neon/bin
-
-WORKDIR /opt/mmseqs/build_sse
-RUN if [ X"$NAMESPACE" = X"" ]; then \
-      cmake -DHAVE_SSE4_1=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
-      make -j $(nproc --all) && make install; \
-    fi
-
-WORKDIR /opt/mmseqs/build_avx
-RUN if [ X"$NAMESPACE" = X"" ]; then \
-      cmake -DHAVE_AVX2=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
-      make -j $(nproc --all) && make install; \
-    fi
-
-WORKDIR /opt/mmseqs/build_neon
-RUN if [ X"$NAMESPACE" = X"arm64v8/" ]; then \
-      cmake  -DHAVE_NEON=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
-      make -j $(nproc --all) && make install; \
-      touch /opt/mmseqs/build_sse/bin/mmseqs; \
-      touch /opt/mmseqs/build_avx/bin/mmseqs; \
-    else \
-      touch /opt/mmseqs/build_neon/bin/mmseqs; \
-    fi
+
+RUN mkdir -p build_sse/src && mkdir -p build_avx/src && mkdir -p build/src; \
+    if [ X"$NAMESPACE" = X"" ]; then \
+       cd /opt/mmseqs/build_sse; \
+       cmake -DHAVE_SSE4_1=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
+       make -j $(nproc --all); \
+       mv src/mmseqs /opt/mmseqs/mmseqs_sse42; \
+       cd /opt/mmseqs/build_avx; \
+       cmake -DHAVE_AVX2=1 -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
+       make -j $(nproc --all); \
+       mv src/mmseqs /opt/mmseqs/mmseqs_avx2; \
+       touch /opt/mmseqs/mmseqs_arch; \
+     else \
+       cd /opt/mmseqs/build; \
+       cmake -DHAVE_MPI=0 -DHAVE_TESTS=0 -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=. ..; \
+       make -j $(nproc --all); \
+       mv src/mmseqs /opt/mmseqs/mmseqs_arch; \
+       touch /opt/mmseqs/mmseqs_sse42 /opt/mmseqs/mmseqs_avx2; \
+     fi
 
 FROM ${NAMESPACE}debian:stable-slim
 ARG NAMESPACE
 MAINTAINER Milot Mirdita <milot@mirdita.de>
-COPY --from=qemu-downloader /usr/bin/qemu-aarch64-static /usr/bin/qemu-aarch64-static
+COPY --from=qemu-downloader /usr/bin/dummy_copy /usr/bin/qemu-aarch64-static* /usr/bin/qemu-ppc64le-static* /usr/bin/
 
 RUN apt-get update && apt-get install -y \
-      gawk bash grep libstdc++6 libgomp1 zlib1g libbz2-1.0 \
+      gawk bash grep libstdc++6 libgomp1 zlib1g libbz2-1.0 wget tar \
     && rm -rf /var/lib/apt/lists/*
 
-COPY --from=mmseqs-builder /opt/mmseqs/build_sse/bin/mmseqs /usr/local/bin/mmseqs_sse42
-COPY --from=mmseqs-builder /opt/mmseqs/build_avx/bin/mmseqs /usr/local/bin/mmseqs_avx2
-COPY --from=mmseqs-builder /opt/mmseqs/build_neon/bin/mmseqs /usr/local/bin/mmseqs_neon
+COPY --from=builder /opt/mmseqs/mmseqs_arch /opt/mmseqs/mmseqs_sse42 /opt/mmseqs/mmseqs_avx2 /usr/local/bin/
 ADD util/mmseqs_wrapper.sh /usr/local/bin/mmseqs
-
-RUN if [ X"$NAMESPACE" = X"arm64v8/" ]; then mv -f /usr/local/bin/mmseqs_neon /usr/local/bin/mmseqs; fi
+RUN if [ X"$NAMESPACE" != X"" ]; then mv -f /usr/local/bin/mmseqs_arch /usr/local/bin/mmseqs; fi
 
 CMD ["/usr/local/bin/mmseqs"]
 
diff --git a/README.md b/README.md
index 264af27..f421ddf 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# MMseqs2: ultra fast and sensitive protein search and clustering suite
+# MMseqs2: ultra fast and sensitive sequence search and clustering suite
 MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets. MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows. The software is designed to run on multiple cores and servers and exhibits very good scalability. MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity. It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.
 
 ##  Publications
@@ -14,14 +14,13 @@ MMseqs2 (Many-against-Many sequence searching) is a software suite to search and
 [![Docker Pulls](https://img.shields.io/docker/pulls/soedinglab/mmseqs2.svg)](https://hub.docker.com/r/soedinglab/mmseqs2)
 [![Build Status](https://dev.azure.com/themartinsteinegger/mmseqs2/_apis/build/status/soedinglab.MMseqs2?branchName=master)](https://dev.azure.com/themartinsteinegger/mmseqs2/_build/latest?definitionId=2&branchName=master)
 [![Travis CI](https://travis-ci.org/soedinglab/MMseqs2.svg?branch=master)](https://travis-ci.org/soedinglab/MMseqs2)
-[![Zenodo DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.840208.svg)](https://zenodo.org/record/1718312)
+<a href="https://chat.mmseqs.com/"><img src="https://chat.mmseqs.com/api/v1/shield.svg?type=online&name=chat&icon=false" /></a>
 
 <p align="center"><img src="https://raw.githubusercontent.com/soedinglab/mmseqs2/master/.github/mmseqs2_logo.png" height="256" /></p>
 
 
 ## Documentation
-The MMseqs2 user guide is available in our [GitHub Wiki](https://github.com/soedinglab/mmseqs2/wiki) or as a [PDF file](https://mmseqs.com/latest/userguide.pdf) (Thanks to [pandoc](https://github.com/jgm/pandoc)!). The wiki also contains [tutorials](https://github.com/soedinglab/MMseqs2/wiki/Tutorials) to learn how to use MMseqs2 with real data.
-
+The MMseqs2 user guide is available in our [GitHub Wiki](https://github.com/soedinglab/mmseqs2/wiki) or as a [PDF file](https://mmseqs.com/latest/userguide.pdf) (Thanks to [pandoc](https://github.com/jgm/pandoc)!). The wiki also contains [tutorials](https://github.com/soedinglab/MMseqs2/wiki/Tutorials) to learn how to use MMseqs2 with real data. For questions please open an issue on [GitHub](https://github.com/soedinglab/MMseqs2/issues) or ask in our [chat](https://chat.mmseqs.com). 
 Keep posted about MMseqs2/Linclust updates by following Martin on [Twitter](https://twitter.com/thesteinegger).
 
 ## Installation
@@ -30,7 +29,7 @@ MMseqs2 can be used by compiling from source, downloading a statically compiled
      # install by brew
      brew install mmseqs2
      # install via conda
-     conda install -c bioconda mmseqs2
+     conda install -c conda-forge -c bioconda mmseqs2
      # install docker
      docker pull soedinglab/mmseqs2
      # static build with SSE4.1
@@ -39,7 +38,7 @@ MMseqs2 can be used by compiling from source, downloading a statically compiled
      wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz; tar xvfz mmseqs-linux-avx2.tar.gz; export PATH=$(pwd)/mmseqs/bin/:$PATH
 
 The AVX2 version is faster than SSE4.1, check if AVX2 is supported by executing `cat /proc/cpuinfo | grep avx2` on Linux and `sysctl -a | grep machdep.cpu.leaf7_features | grep AVX2` on MacOS).
-We also provide static binaries for MacOS and Windows at [mmseqs.com/latest](https://mmseqs.com/latest).
+We also provide static binaries for all supported platforms at [mmseqs.com/latest](https://mmseqs.com/latest).
 
 MMseqs2 comes with a bash command and parameter auto completion, which can be activated by adding the following lines to your $HOME/.bash_profile:
 
@@ -85,29 +84,34 @@ Please adjust the [clustering criteria](https://github.com/soedinglab/MMseqs2/wi
 
 ### Search
          
-The `easy-search` searches directly with a FASTA/FASTQ files against either another FASTA/FASTQ file or an already existing MMseqs2 database.
+The `easy-search` workflow searches directly with a FASTA/FASTQ file against either another FASTA/FASTQ file or an already existing MMseqs2 database.
         
-        mmseqs easy-search examples/QUERY.fasta DB.fasta alnRes tmp
+        mmseqs easy-search examples/QUERY.fasta examples/DB.fasta alnRes.m8 tmp
  
-It is also possible to pre-compute the index for the target database:
+It is also possible to pre-compute the index for the target database. This reduces overhead when searching repeatedly against the same database.
 
         mmseqs createdb examples/DB.fasta targetDB
         mmseqs createindex targetDB tmp
-        mmseqs easy-search examples/QUERY.fasta targetDB alnRes tmp
+        mmseqs easy-search examples/QUERY.fasta targetDB alnRes.m8 tmp
+        
+The `databases` workflow provides download and setup procedures for many public reference databases, such as UniRef, NR, NT, Pfam and many more (see [Downloading databases](https://github.com/soedinglab/mmseqs2/wiki#downloading-databases)). For example, to download and search against a database containing the Swiss-Prot reference proteins run:
+
+        mmseqs databases UniProtKB/Swiss-Prot swissprot tmp
+        mmseqs easy-search examples/QUERY.fasta swissprot alnRes.m8 tmp
         
 The speed and sensitivity of the `search` can be adjusted with `-s` parameter and should be adapted based on your use case (see [setting sensitivity -s parameter](https://github.com/soedinglab/mmseqs2/wiki#set-sensitivity--s-parameter)). A very fast search would use a sensitivity of `-s 1.0`, while a very sensitive search would use a sensitivity of up to `-s 7.0`. A detailed guide how to speed up searches is [here](https://github.com/soedinglab/MMseqs2/wiki#how-to-control-the-speed-of-the-search).
 
 The output can be customized with the `--format-output` option e.g. `--format-output "query,target,qaln,taln"` returns the query and target accession and the pairwise alignments in tab separated format. You can choose many different [output columns](https://github.com/soedinglab/mmseqs2/wiki#custom-alignment-format-with-convertalis).
 
 ### Taxonomy
-The `easy-taxonomy` workflow can be used assign sequences taxonomical labels. It performs a search against a target sequence databases and computes the lowest common ancestor of all equal scoring top hits (default). Other assignment options are available through `--lca-mode`.
+The `easy-taxonomy` workflow can be used to assign sequences taxonomical labels. It performs a search against a sequence database with taxonomy information (seqTaxDb), chooses the most representative sets of aligned target sequences according to the strategy selected with `--lca-mode`, and computes the lowest common ancestor among those.
 
         mmseqs createdb examples/DB.fasta targetDB
         mmseqs createtaxdb targetDB tmp
         mmseqs createindex targetDB tmp
         mmseqs easy-taxonomy examples/QUERY.fasta targetDB alnRes tmp
 
-In default `createtaxdb` assigns every sequence with a Uniprot accession to a taxonomical identifier and downloads the NCBI taxonomy. We also support [BLAST](https://github.com/soedinglab/MMseqs2/wiki#create-a-sequence-database-with-taxonomic-information-from-an-existing-blast-database), [SILVA](https://github.com/soedinglab/MMseqs2/wiki#create-a-sequence-database-with-taxonomic-information-for-silva) or [custom taxonomical](https://github.com/soedinglab/MMseqs2/wiki#manually-annotate-a-sequence-database-with-taxonomic-information) databases.
+By default, `createtaxdb` assigns a taxonomical identifier to every sequence with a Uniprot accession and downloads the NCBI taxonomy. We also support [BLAST](https://github.com/soedinglab/MMseqs2/wiki#create-a-sequence-database-with-taxonomic-information-from-an-existing-blast-database), [SILVA](https://github.com/soedinglab/MMseqs2/wiki#create-a-sequence-database-with-taxonomic-information-for-silva) or [custom taxonomical](https://github.com/soedinglab/MMseqs2/wiki#manually-annotate-a-sequence-database-with-taxonomic-information) databases. Many common taxonomic reference databases can be easily downloaded and set up by the [`databases` workflow](https://github.com/soedinglab/mmseqs2/wiki#downloading-databases).
 
 Read more about the [taxonomy format](https://github.com/soedinglab/MMseqs2/wiki#taxonomy-format) and the [classification](https://github.com/soedinglab/MMseqs2/wiki#taxonomy-assignment-using-mmseqs-taxonomy) in our user guide.
 
@@ -140,3 +144,9 @@ To search with multiple servers, call the `search` or `cluster` workflow with th
 
         RUNNER="mpirun -pernode -np 42" mmseqs search queryDB targetDB resultDB tmp
 
+## Contributors
+
+MMseqs2 exists thanks to all the people who contribute. 
+<a href="https://github.com/soedinglab/mmseqs2/graphs/contributors">
+  <img src="https://contributors-img.firebaseapp.com/image?repo=soedinglab/mmseqs2" />
+</a>
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 9df46ea..c0a54b9 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -109,7 +109,7 @@ jobs:
           export TTY=0
           if [ "${BUILD_TYPE}" = "ASan" ]; then
             echo "leak:libgomp1" > ${BUILD_SOURCESDIRECTORY}/ASan.supp
-            export export ASAN_OPTIONS=suppressions=${BUILD_SOURCESDIRECTORY}/ASan.supp
+            export ASAN_OPTIONS=suppressions=${BUILD_SOURCESDIRECTORY}/ASan.supp
           fi
           ${BUILD_SOURCESDIRECTORY}/util/regression/run_regression.sh ${BUILD_SOURCESDIRECTORY}/build/src/mmseqs ${BUILD_SOURCESDIRECTORY}/regression
         displayName: Run Regression Suite
diff --git a/cmake/MMseqsResourceCompiler.cmake b/cmake/MMseqsResourceCompiler.cmake
index 09b973c..c1c5e65 100644
--- a/cmake/MMseqsResourceCompiler.cmake
+++ b/cmake/MMseqsResourceCompiler.cmake
@@ -34,14 +34,16 @@ endif()
 
 function(compile_resource INPUT_FILE OUTPUT_FILE)
     get_filename_component(INPUT_FILE_NAME ${PROJECT_SOURCE_DIR}/data/${INPUT_FILE} NAME)
-    set(OUTPUT_FILE ${PROJECT_BINARY_DIR}/generated/${INPUT_FILE_NAME}.h PARENT_SCOPE)
-    add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/generated/${INPUT_FILE_NAME}.h
-            COMMAND ${compile_resource__internal_dir}/checkshell.sh ${SHELLCHECK_EXECUTABLE} ${INPUT_FILE}
+    get_filename_component(INPUT_FILE_DIRECTORY ${PROJECT_SOURCE_DIR}/data/${INPUT_FILE} DIRECTORY)
+    set(OUTPUT_FILE ${PROJECT_BINARY_DIR}/generated/${INPUT_FILE_NAME}.h)
+    set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
+    add_custom_command(OUTPUT ${OUTPUT_FILE}
+            COMMAND ${compile_resource__internal_dir}/checkshell.sh ${SHELLCHECK_EXECUTABLE} ${INPUT_FILE_NAME}
             COMMAND mkdir -p ${PROJECT_BINARY_DIR}/generated
-            COMMAND ${XXD_EXECUTABLE} ${XXD_PARAMS} ${INPUT_FILE} > ${PROJECT_BINARY_DIR}/generated/${INPUT_FILE_NAME}.h
-            COMMAND ${SED_EXECUTABLE} 's!unsigned char!static const unsigned char!' < ${PROJECT_BINARY_DIR}/generated/${INPUT_FILE_NAME}.h > ${PROJECT_BINARY_DIR}/generated/${INPUT_FILE_NAME}.h.tmp
-            COMMAND mv -f ${PROJECT_BINARY_DIR}/generated/${INPUT_FILE_NAME}.h.tmp ${PROJECT_BINARY_DIR}/generated/${INPUT_FILE_NAME}.h
-            WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/data/
+            COMMAND ${XXD_EXECUTABLE} ${XXD_PARAMS} ${INPUT_FILE_NAME} > ${OUTPUT_FILE}
+            COMMAND ${SED_EXECUTABLE} 's!unsigned char!static const unsigned char!' < ${OUTPUT_FILE} > ${OUTPUT_FILE}.tmp
+            COMMAND mv -f ${OUTPUT_FILE}.tmp ${OUTPUT_FILE}
+            WORKING_DIRECTORY ${INPUT_FILE_DIRECTORY}
             DEPENDS ${PROJECT_SOURCE_DIR}/data/${INPUT_FILE})
-    set_source_files_properties(${PROJECT_BINARY_DIR}/generated/${INPUT_FILE_NAME}.h PROPERTIES GENERATED TRUE)
+    set_source_files_properties(${OUTPUT_FILE} PROPERTIES GENERATED TRUE)
 endfunction()
diff --git a/cmake/MMseqsSetupDerivedTarget.cmake b/cmake/MMseqsSetupDerivedTarget.cmake
index 2f1803e..a5a85fa 100644
--- a/cmake/MMseqsSetupDerivedTarget.cmake
+++ b/cmake/MMseqsSetupDerivedTarget.cmake
@@ -6,7 +6,7 @@ function (mmseqs_setup_derived_target TARGET)
     get_target_property(DEF_TMP mmseqs-framework COMPILE_DEFINITIONS)
     get_target_property(INCL_TMP mmseqs-framework INCLUDE_DIRECTORIES)
 
-    target_link_libraries(${TARGET} mmseqs-framework libzstd_static)
+    target_link_libraries(${TARGET} mmseqs-framework)
     append_target_property(${TARGET} COMPILE_FLAGS ${COMPILE_TMP})
     append_target_property(${TARGET} LINK_FLAGS ${LINK_TMP})
     set_property(TARGET ${TARGET} APPEND PROPERTY COMPILE_DEFINITIONS ${DEF_TMP})
diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt
index 52cdfc9..577dfff 100644
--- a/data/CMakeLists.txt
+++ b/data/CMakeLists.txt
@@ -1,46 +1,18 @@
 include(MMseqsResourceCompiler)
 
-set(COMPILED_RESOURCES
-        easysearch.sh
-        easycluster.sh
-        easytaxonomy.sh
-        blastp.sh
-        blastpgp.sh
-        map.sh
-        rbh.sh
-        linclust.sh
-        clustering.sh
-        cascaded_clustering.sh
-        update_clustering.sh
-        searchtargetprofile.sh
-        createindex.sh
-        createtaxdb.sh
-        translated_search.sh
-        taxonomy.sh
-        multihitdb.sh
-        multihitsearch.sh
-        enrich.sh
-        blastn.sh
+
+add_subdirectory(resources)
+add_subdirectory(workflow)
+set(GENERATED_MATRICES
         VTML80.out
         VTML40.out
         nucleotide.out
         blosum62.out
         PAM30.out
-        CovSeqidQscPercMinDiag.lib
-        CovSeqidQscPercMinDiagTargetCov.lib
-        ExpOpt3_8_polished.cs32.lib
-        Library255_may17.lib
-        libPure_blosum62_255.lib
-        libPure_blosum62_32.lib
-        libPolished_8.lib
-        searchslicedtargetprofile.sh
-        cs219.lib
-        linsearch.sh
-        krona_prelude.html
         )
 
 set(GENERATED_OUTPUT_HEADERS "")
-FOREACH(INPUT_FILE ${COMPILED_RESOURCES})
+FOREACH(INPUT_FILE ${GENERATED_MATRICES} ${GENERATED_RESOURCES} ${GENERATED_WORKFLOWS})
     compile_resource(${INPUT_FILE} OUTPUT_FILE)
     list(APPEND GENERATED_OUTPUT_HEADERS "${OUTPUT_FILE}")
 ENDFOREACH()
diff --git a/data/resources/CMakeLists.txt b/data/resources/CMakeLists.txt
new file mode 100644
index 0000000..4b39c63
--- /dev/null
+++ b/data/resources/CMakeLists.txt
@@ -0,0 +1,11 @@
+set(GENERATED_RESOURCES
+        resources/CovSeqidQscPercMinDiag.lib
+        resources/CovSeqidQscPercMinDiagTargetCov.lib
+        resources/ExpOpt3_8_polished.cs32.lib
+        resources/Library255_may17.lib
+        resources/libPolished_8.lib
+        resources/cs219.lib
+        resources/krona_prelude.html
+
+        PARENT_SCOPE
+        )
diff --git a/data/CovSeqidQscPercMinDiag.lib b/data/resources/CovSeqidQscPercMinDiag.lib
similarity index 100%
rename from data/CovSeqidQscPercMinDiag.lib
rename to data/resources/CovSeqidQscPercMinDiag.lib
diff --git a/data/CovSeqidQscPercMinDiagTargetCov.lib b/data/resources/CovSeqidQscPercMinDiagTargetCov.lib
similarity index 100%
rename from data/CovSeqidQscPercMinDiagTargetCov.lib
rename to data/resources/CovSeqidQscPercMinDiagTargetCov.lib
diff --git a/data/ExpOpt3_8_polished.cs32.lib b/data/resources/ExpOpt3_8_polished.cs32.lib
similarity index 100%
rename from data/ExpOpt3_8_polished.cs32.lib
rename to data/resources/ExpOpt3_8_polished.cs32.lib
diff --git a/data/Library255_may17.lib b/data/resources/Library255_may17.lib
similarity index 100%
rename from data/Library255_may17.lib
rename to data/resources/Library255_may17.lib
diff --git a/data/cs219.lib b/data/resources/cs219.lib
similarity index 100%
rename from data/cs219.lib
rename to data/resources/cs219.lib
diff --git a/data/krona_prelude.html b/data/resources/krona_prelude.html
similarity index 100%
rename from data/krona_prelude.html
rename to data/resources/krona_prelude.html
diff --git a/data/libPolished_8.lib b/data/resources/libPolished_8.lib
similarity index 100%
rename from data/libPolished_8.lib
rename to data/resources/libPolished_8.lib
diff --git a/data/libPure_blosum62_255.lib b/data/resources/libPure_blosum62_255.lib
similarity index 100%
rename from data/libPure_blosum62_255.lib
rename to data/resources/libPure_blosum62_255.lib
diff --git a/data/libPure_blosum62_32.lib b/data/resources/libPure_blosum62_32.lib
similarity index 100%
rename from data/libPure_blosum62_32.lib
rename to data/resources/libPure_blosum62_32.lib
diff --git a/data/workflow/CMakeLists.txt b/data/workflow/CMakeLists.txt
new file mode 100644
index 0000000..1f7c7ba
--- /dev/null
+++ b/data/workflow/CMakeLists.txt
@@ -0,0 +1,28 @@
+set(GENERATED_WORKFLOWS
+        workflow/easysearch.sh
+        workflow/easycluster.sh
+        workflow/easytaxonomy.sh
+        workflow/easyrbh.sh
+        workflow/blastp.sh
+        workflow/blastpgp.sh
+        workflow/map.sh
+        workflow/rbh.sh
+        workflow/linclust.sh
+        workflow/clustering.sh
+        workflow/cascaded_clustering.sh
+        workflow/update_clustering.sh
+        workflow/searchtargetprofile.sh
+        workflow/createindex.sh
+        workflow/createtaxdb.sh
+        workflow/translated_search.sh
+        workflow/taxonomy.sh
+        workflow/multihitdb.sh
+        workflow/multihitsearch.sh
+        workflow/enrich.sh
+        workflow/blastn.sh
+        workflow/searchslicedtargetprofile.sh
+        workflow/linsearch.sh
+        workflow/databases.sh
+        workflow/nucleotide_clustering.sh
+        PARENT_SCOPE
+        )
diff --git a/data/blastn.sh b/data/workflow/blastn.sh
similarity index 87%
rename from data/blastn.sh
rename to data/workflow/blastn.sh
index 2926f91..97330dc 100755
--- a/data/blastn.sh
+++ b/data/workflow/blastn.sh
@@ -67,10 +67,13 @@ fi
 
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
-    "$MMSEQS" rmdb "$4/q_orfs"
-    "$MMSEQS" rmdb "$4/q_orfs_aa"
-    "$MMSEQS" rmdb "$4/t_orfs"
-    "$MMSEQS" rmdb "$4/t_orfs_aa"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "$4/q_orfs" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "$4/q_orfs_aa" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "$4/t_orfs" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "$4/t_orfs_aa" ${VERBOSITY}
 fi
 
diff --git a/data/blastp.sh b/data/workflow/blastp.sh
similarity index 99%
rename from data/blastp.sh
rename to data/workflow/blastp.sh
index 63af17c..5722cfb 100755
--- a/data/blastp.sh
+++ b/data/workflow/blastp.sh
@@ -99,7 +99,6 @@ while [ "$STEP" -lt "$STEPS" ]; do
 done
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
     STEP=0
     while [ "$STEP" -lt "$STEPS" ]; do
         # shellcheck disable=SC2086
diff --git a/data/blastpgp.sh b/data/workflow/blastpgp.sh
similarity index 92%
rename from data/blastpgp.sh
rename to data/workflow/blastpgp.sh
index 86bbb77..beb12ae 100755
--- a/data/blastpgp.sh
+++ b/data/workflow/blastpgp.sh
@@ -102,12 +102,14 @@ while [ $STEP -lt $NUM_IT ]; do
 done
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
     STEP=0
     while [ "$STEP" -lt "$NUM_IT" ]; do
-        "$MMSEQS" rmdb "${TMP_PATH}/pref_$STEP"
-        "$MMSEQS" rmdb "${TMP_PATH}/aln_$STEP"
-        "$MMSEQS" rmdb "${TMP_PATH}/profile_$STEP"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/pref_$STEP" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/aln_$STEP" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/profile_$STEP" ${VERBOSITY}
         STEP=$((STEP+1))
     done
     rm -f "$TMP_PATH/blastpgp.sh"
diff --git a/data/cascaded_clustering.sh b/data/workflow/cascaded_clustering.sh
similarity index 84%
rename from data/cascaded_clustering.sh
rename to data/workflow/cascaded_clustering.sh
index 051de39..7696f65 100755
--- a/data/cascaded_clustering.sh
+++ b/data/workflow/cascaded_clustering.sh
@@ -203,43 +203,61 @@ if [ -n "$REASSIGN" ]; then
             || fail "Clustering step $STEP died"
 
     if [ -n "$REMOVE_TMP" ]; then
-        echo "Remove temporary files"
-        "$MMSEQS" rmdb "${TMP_PATH}/aln"
-        "$MMSEQS" rmdb "${TMP_PATH}/clu_not_accepted"
-        "$MMSEQS" rmdb "${TMP_PATH}/clu_accepted"
-        "$MMSEQS" rmdb "${TMP_PATH}/clu_not_accepted_swap"
-        "$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned"
-        "$MMSEQS" rmdb "${TMP_PATH}/seq_seeds"
-        "$MMSEQS" rmdb "${TMP_PATH}/seq_seeds.merged"
-        "$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref"
-        "$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped"
-        "$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln"
-        "$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_ocol"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/clu_not_accepted" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/clu_accepted" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/clu_not_accepted_swap" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/seq_seeds" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/seq_seeds.merged" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/seq_wrong_assigned_pref_swaped_aln_ocol" ${VERBOSITY}
         rm -f "${TMP_PATH}/missing.single.seqs"
         rm -f "${TMP_PATH}/clu_accepted_plus_wrong.tsv"
-        "$MMSEQS" rmdb "${TMP_PATH}/missing.single.seqs.db"
-        "$MMSEQS" rmdb "${TMP_PATH}/clu_accepted_plus_wrong"
-        "$MMSEQS" rmdb "${TMP_PATH}/clu_accepted_plus_wrong_plus_single"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/missing.single.seqs.db" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/clu_accepted_plus_wrong" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/clu_accepted_plus_wrong_plus_single" ${VERBOSITY}
 
     fi
 fi
 
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
-    "$MMSEQS" rmdb "${TMP_PATH}/clu_redundancy"
-    "$MMSEQS" rmdb "${TMP_PATH}/input_step_redundancy"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu_redundancy" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input_step_redundancy" ${VERBOSITY}
     STEP=0
     while [ "$STEP" -lt "$STEPS" ]; do
-        "$MMSEQS" rmdb "${TMP_PATH}/pref_step$STEP"
-        "$MMSEQS" rmdb "${TMP_PATH}/aln_step$STEP"
-        "$MMSEQS" rmdb "${TMP_PATH}/clu_step$STEP"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/pref_step$STEP" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/aln_step$STEP" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/clu_step$STEP" ${VERBOSITY}
         STEP=$((STEP+1))
     done
 
     STEP=1
     while [ "$STEP" -lt "$STEPS" ]; do
-        "$MMSEQS" rmdb "${TMP_PATH}/input_step$STEP"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/input_step$STEP" ${VERBOSITY}
         STEP=$((STEP+1))
     done
 
diff --git a/data/clustering.sh b/data/workflow/clustering.sh
similarity index 81%
rename from data/clustering.sh
rename to data/workflow/clustering.sh
index dedeb8c..fa419b5 100755
--- a/data/clustering.sh
+++ b/data/workflow/clustering.sh
@@ -66,13 +66,18 @@ fi
         || fail "Merging of clusters has died"
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
-    "$MMSEQS" rmdb "${TMP_PATH}/pref"
-    "$MMSEQS" rmdb "${TMP_PATH}/aln"
-    "$MMSEQS" rmdb "${TMP_PATH}/clu_step0"
-    "$MMSEQS" rmdb "${TMP_PATH}/clu_redundancy"
-    "$MMSEQS" rmdb "${TMP_PATH}/aln_redundancy"
-    "$MMSEQS" rmdb "${TMP_PATH}/input_step_redundancy"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/pref" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu_step0" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu_redundancy" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln_redundancy" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input_step_redundancy" ${VERBOSITY}
     rm -f "${TMP_PATH}/order_redundancy"
     rm -f "${TMP_PATH}/clustering.sh"
 fi
diff --git a/data/createindex.sh b/data/workflow/createindex.sh
similarity index 68%
rename from data/createindex.sh
rename to data/workflow/createindex.sh
index 75ebe0c..d751273 100755
--- a/data/createindex.sh
+++ b/data/workflow/createindex.sh
@@ -19,39 +19,39 @@ if [ -n "$TRANSLATED" ]; then
     # 1. extract orf
     if notExists "$2/orfs_aa.dbtype"; then
         # shellcheck disable=SC2086
-        "$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" $ORF_PAR \
+        "$MMSEQS" extractorfs "$INPUT" "$2/orfs_aa" ${ORF_PAR} \
             || fail "extractorfs died"
     fi
 
     # shellcheck disable=SC2086
-    "$MMSEQS" $INDEXER "$2/orfs_aa" "$INPUT" $INDEX_PAR \
+    "$MMSEQS" $INDEXER "$2/orfs_aa" "$INPUT" ${INDEX_PAR} \
         || fail "indexdb died"
 
     if [ -n "$REMOVE_TMP" ]; then
-        echo "Remove temporary files"
-        "$MMSEQS" rmdb "$2/orfs_aa"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "$2/orfs_aa" ${VERBOSITY}
         rm -f "$2/createindex.sh"
     fi
 elif [ -n "$LIN_NUCL" ] || [ -n "$NUCL" ]; then
       # 1. extract orf
     if notExists "$2/nucl_split_seq.dbtype"; then
         # shellcheck disable=SC2086
-        "$MMSEQS" splitsequence "$INPUT" "$2/nucl_split_seq" $SPLIT_SEQ_PAR \
+        "$MMSEQS" splitsequence "$INPUT" "$2/nucl_split_seq" ${SPLIT_SEQ_PAR} \
             || fail "splitsequence died"
     fi
 
     # shellcheck disable=SC2086
-    "$MMSEQS" $INDEXER "$2/nucl_split_seq" "$INPUT" $INDEX_PAR \
+    "$MMSEQS" $INDEXER "$2/nucl_split_seq" "$INPUT" ${INDEX_PAR} \
         || fail "indexdb died"
 
     if [ -n "$REMOVE_TMP" ]; then
-        echo "Remove temporary files"
-        "$MMSEQS" rmdb "$2/nucl_split_seq"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "$2/nucl_split_seq" ${VERBOSITY}
         rm -f "$2/createindex.sh"
     fi
 else
     # shellcheck disable=SC2086
-    "$MMSEQS" $INDEXER "$INPUT" "$INPUT" $INDEX_PAR \
+    "$MMSEQS" $INDEXER "$INPUT" "$INPUT" ${INDEX_PAR} \
         || fail "indexdb died"
 fi
 
diff --git a/data/createtaxdb.sh b/data/workflow/createtaxdb.sh
similarity index 54%
rename from data/createtaxdb.sh
rename to data/workflow/createtaxdb.sh
index 595e48a..6e5e7ef 100755
--- a/data/createtaxdb.sh
+++ b/data/workflow/createtaxdb.sh
@@ -21,33 +21,30 @@ TMP_PATH="$2"
 
 if [ "$DOWNLOAD_NCBITAXDUMP" -eq "1" ]; then
     # Download NCBI taxon information
-    if notExists "$4/ncbi_download.complete"; then
+    if notExists "${TMP_PATH}/ncbi_download.complete"; then
         echo "Download taxdump.tar.gz"
-            wget -nv -O - "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" \
+        wget -nv -O - "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" \
            | tar -C "${TMP_PATH}" -xzf - names.dmp nodes.dmp merged.dmp delnodes.dmp
         touch "${TMP_PATH}/ncbi_download.complete"
     fi
     NCBITAXINFO="${TMP_PATH}"
 fi
-if [ "$DOWNLOAD_MAPPING" -eq "1" ]; then
-    # Download the latest UniProt ID mapping to extract taxon identifiers
-    if notExists "${TMP_PATH}/mapping_download.complete"; then
-        echo "Download idmapping.dat.gz"
-        URL="ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz"
-        wget -nv -O - "$URL" | zcat | awk '$2 == "NCBI_TaxID" {print $1"\t"$3 }' > "${TMP_PATH}/taxidmapping"
-        touch "${TMP_PATH}/mapping_download.complete"
+if notExists "${TAXDBNAME}_mapping"; then
+    if [ "$DOWNLOAD_MAPPING" -eq "1" ]; then
+        # Download the latest UniProt ID mapping to extract taxon identifiers
+        if notExists "${TMP_PATH}/mapping_download.complete"; then
+            echo "Download idmapping.dat.gz"
+            URL="https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz"
+            wget -nv -O - "$URL" | zcat | awk '$2 == "NCBI_TaxID" {print $1"\t"$3 }' > "${TMP_PATH}/taxidmapping"
+            touch "${TMP_PATH}/mapping_download.complete"
+        fi
+        MAPPINGFILE="${TMP_PATH}/taxidmapping"
     fi
-    MAPPINGFILE="${TMP_PATH}/taxidmapping"
-fi
-# create mapping
-if notExists "${TMP_PATH}/targetDB_mapping.complete"; then
     awk 'NR == FNR { f[$1] = $2; next } $2 in f { print $1"\t"f[$2] }' \
-        "$MAPPINGFILE" "${TAXDBNAME}.lookup" > "${TMP_PATH}/targetDB_mapping"
-    touch "${TMP_PATH}/targetDB_mapping.complete"
+        "$MAPPINGFILE" "${TAXDBNAME}.lookup" > "${TAXDBNAME}_mapping"
 fi
 
 # finalize database
-cp -f "${TMP_PATH}/targetDB_mapping" "${TAXDBNAME}_mapping"
 cp -f "${NCBITAXINFO}/names.dmp"     "${TAXDBNAME}_names.dmp"
 cp -f "${NCBITAXINFO}/nodes.dmp"     "${TAXDBNAME}_nodes.dmp"
 cp -f "${NCBITAXINFO}/merged.dmp"    "${TAXDBNAME}_merged.dmp"
@@ -55,13 +52,10 @@ cp -f "${NCBITAXINFO}/delnodes.dmp"  "${TAXDBNAME}_delnodes.dmp"
 echo "Database created"
 
 if [ -n "$REMOVE_TMP" ]; then
-   echo "Remove temporary files"
    rm -f "${TMP_PATH}/names.dmp" "${TMP_PATH}/nodes.dmp" "${TMP_PATH}/merged.dmp" "${TMP_PATH}/delnodes.dmp"
    rm -f "${TMP_PATH}/taxidmapping"
    if [ "$DOWNLOAD_DATA" -eq "1" ]; then
       rm -f "${TMP_PATH}/ncbi_download.complete" "${TMP_PATH}/mapping_download.complete"
    fi
-   rm -f "${TMP_PATH}/targetDB_mapping.complete"
-   rm -f "${TMP_PATH}/targetDB_mapping"
-   rm -f createtaxdb.sh
+   rm -f "${TMP_PATH}/createtaxdb.sh"
 fi
diff --git a/data/workflow/databases.sh b/data/workflow/databases.sh
new file mode 100644
index 0000000..d7c1427
--- /dev/null
+++ b/data/workflow/databases.sh
@@ -0,0 +1,292 @@
+#!/bin/sh -e
+fail() {
+    echo "Error: $1"
+    exit 1
+}
+
+notExists() {
+    [ ! -f "$1" ]
+}
+
+hasCommand () {
+    command -v "$1" >/dev/null 2>&1
+}
+
+ARR=""  # space-separated list of single-quoted words; expanded later via: eval "set -- $ARR"
+push_back() {  # append "$1" to ARR as one shell-quoted word
+    # shellcheck disable=SC1003
+    CURR="$(printf '%s' "$1" | awk '{ gsub(/'\''/, "'\''\\'\'''\''"); print; }')"  # awk escapes every embedded single quote so the value survives single-quote wrapping
+    if [ -z "$ARR" ]; then
+        ARR=''\'$CURR\'''  # first element: no leading separator
+    else
+        ARR=$ARR' '\'$CURR\'''  # later elements: separated by one space
+    fi
+}
+
+STRATEGY=""
+if hasCommand aria2c; then STRATEGY="$STRATEGY ARIA"; fi
+if hasCommand curl;   then STRATEGY="$STRATEGY CURL"; fi
+if hasCommand wget;   then STRATEGY="$STRATEGY WGET"; fi
+if [ "$STRATEGY" = "" ]; then
+    fail "No download tool found in PATH. Please install aria2c, curl or wget."
+fi
+
+downloadFile() {  # downloadFile <url> <output>: try each tool listed in $STRATEGY until one succeeds
+    URL="$1"
+    OUTPUT="$2"
+    set +e  # a failing tool must not abort the script; the next one is tried instead
+    for i in $STRATEGY; do
+        case "$i" in
+        ARIA)
+            FILENAME=$(basename "${OUTPUT}")
+            DIR=$(dirname "${OUTPUT}")
+            aria2c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "$FILENAME" -d "$DIR" "$URL" && set -e && return 0
+            ;;
+        CURL)
+            curl -o "$OUTPUT" "$URL" && set -e && return 0
+            ;;
+        WGET)
+            wget -O "$OUTPUT" "$URL" && set -e && return 0
+            ;;
+        esac
+    done
+    set -e  # errexit is re-enabled on every exit path, the successful ones included
+    fail "Could not download $URL to $OUTPUT"
+}
+
+# check number of input variables
+[ "$#" -ne 3 ] && echo "Please provide <selection> <outDB> <tmp>" && exit 1;
+[ ! -d "$3" ] &&  echo "tmp directory $3 not found!" && mkdir -p "$3";
+
+SELECTION="$1"
+OUTDB="$2"
+TMP_PATH="$3"
+
+INPUT_TYPE=""
+case "${SELECTION}" in
+    "UniRef100")
+        if notExists "${TMP_PATH}/uniref100.fasta.gz"; then
+            downloadFile "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.release_note" "${TMP_PATH}/version"
+            downloadFile "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref100/uniref100.fasta.gz" "${TMP_PATH}/uniref100.fasta.gz"
+        fi
+        push_back "${TMP_PATH}/uniref100.fasta.gz"
+        INPUT_TYPE="FASTA_LIST"
+    ;;
+    "UniRef90")
+        if notExists "${TMP_PATH}/uniref90.fasta.gz"; then
+            downloadFile "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.release_note" "${TMP_PATH}/version"
+            downloadFile "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz" "${TMP_PATH}/uniref90.fasta.gz"
+        fi
+        push_back "${TMP_PATH}/uniref90.fasta.gz"
+        INPUT_TYPE="FASTA_LIST"
+    ;;
+    "UniRef50")
+        if notExists "${TMP_PATH}/uniref50.fasta.gz"; then
+            downloadFile "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.release_note" "${TMP_PATH}/version"
+            downloadFile "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz" "${TMP_PATH}/uniref50.fasta.gz"
+        fi
+        push_back "${TMP_PATH}/uniref50.fasta.gz"
+        INPUT_TYPE="FASTA_LIST"
+    ;;
+    "UniProtKB")
+        if notExists "${TMP_PATH}/uniprot_sprot.fasta.gz" || notExists "${TMP_PATH}/uniprot_trembl.fasta.gz"; then
+            downloadFile "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/reldate.txt" "${TMP_PATH}/version"
+            downloadFile "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" "${TMP_PATH}/uniprot_sprot.fasta.gz"
+            downloadFile "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" "${TMP_PATH}/uniprot_trembl.fasta.gz"
+        fi
+        push_back "${TMP_PATH}/uniprot_sprot.fasta.gz"
+        push_back "${TMP_PATH}/uniprot_trembl.fasta.gz"
+        INPUT_TYPE="FASTA_LIST"
+    ;;
+    "UniProtKB/TrEMBL")
+        if notExists "${TMP_PATH}/uniprot_trembl.fasta.gz"; then
+          downloadFile "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/reldate.txt" "${TMP_PATH}/version"
+          downloadFile "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz" "${TMP_PATH}/uniprot_trembl.fasta.gz"
+        fi
+        push_back "${TMP_PATH}/uniprot_trembl.fasta.gz"
+        INPUT_TYPE="FASTA_LIST"
+    ;;
+    "UniProtKB/Swiss-Prot")
+        if notExists "${TMP_PATH}/uniprot_sprot.fasta.gz"; then
+          downloadFile "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/reldate.txt" "${TMP_PATH}/version"
+          downloadFile "https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz" "${TMP_PATH}/uniprot_sprot.fasta.gz"
+        fi
+        push_back "${TMP_PATH}/uniprot_sprot.fasta.gz"
+        INPUT_TYPE="FASTA_LIST"
+    ;;
+    "NR")
+        if notExists "${TMP_PATH}/nr.gz"; then
+            date "+%s" > "${TMP_PATH}/version"
+            downloadFile "https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz" "${TMP_PATH}/nr.gz"
+        fi
+        push_back "${TMP_PATH}/nr.gz"
+        INPUT_TYPE="FASTA_LIST"
+    ;;
+    "NT")
+        if notExists "${TMP_PATH}/nt.gz"; then
+            date "+%s" > "${TMP_PATH}/version"
+            downloadFile "https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nt.gz" "${TMP_PATH}/nt.gz"
+        fi
+        push_back "${TMP_PATH}/nt.gz"
+        INPUT_TYPE="FASTA_LIST"
+    ;;
+    "PDB")
+        if notExists "${TMP_PATH}/pdb_seqres.txt.gz"; then
+            date "+%s" > "${TMP_PATH}/version"
+            downloadFile "https://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt.gz" "${TMP_PATH}/pdb_seqres.txt.gz"
+        fi
+        push_back "${TMP_PATH}/pdb_seqres.txt.gz"
+        INPUT_TYPE="FASTA_LIST"
+    ;;
+    "PDB70")
+        if notExists "${TMP_PATH}/msa.index"; then
+            date "+%s" > "${TMP_PATH}/version"
+            downloadFile "http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pdb70_from_mmcif_latest.tar.gz" "${TMP_PATH}/pdb70.tar.gz"
+            tar -xOzf "${TMP_PATH}/pdb70.tar.gz" pdb70_a3m.ffdata | tr -d '\000' | awk -v outfile="${TMP_PATH}/msa" 'function writeEntry() { printf "%s\0", data >> outfile; size = length(data) + 1; data=""; print id"\t"offset"\t"size >> outindex; offset = offset + size; } BEGIN { data = ""; offset = 0; id = 1; if(length(outfile) == 0) { outfile="output"; } outindex = outfile".index"; printf("") > outfile; printf("") > outindex; printf("%c%c%c%c",11,0,0,0) > outfile".dbtype"; } /^>ss_/ { inss = 1; entry = 0; next; } inss == 1 { inss = 0; next; } /^>/ && entry == 0 { if (id > 1) { writeEntry(); } id = id + 1; data = ">"substr($1, 2)"\n"; entry = entry + 1; next; } entry > 0 { data = data""$0"\n"; entry = entry + 1; next; } END { writeEntry(); close(outfile); close(outfile".index"); }'
+            rm -f "${TMP_PATH}/pdb70.tar.gz"
+        fi
+        INPUT_TYPE="A3M"
+    ;;
+    "Pfam-A.full")
+        if notExists "${TMP_PATH}/db.msa.gz"; then
+            downloadFile "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam.version.gz" "${TMP_PATH}/version"
+            downloadFile "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz" "${TMP_PATH}/db.msa.gz"
+        fi
+        INPUT_TYPE="MSA"
+    ;;
+    "Pfam-A.seed")
+        if notExists "${TMP_PATH}/db.msa.gz"; then
+            downloadFile "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam.version.gz" "${TMP_PATH}/version"
+            downloadFile "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.seed.gz" "${TMP_PATH}/db.msa.gz"
+        fi
+        INPUT_TYPE="MSA"
+    ;;
+    "eggNOG")
+        if notExists "${TMP_PATH}/download.done"; then
+            date "+%s" > "${TMP_PATH}/version"
+            downloadFile "http://eggnogdb.embl.de/download/eggnog_5.0/per_tax_level/2/2_raw_algs.tar" "${TMP_PATH}/bacteria"
+            downloadFile "http://eggnogdb.embl.de/download/eggnog_5.0/per_tax_level/2157/2157_raw_algs.tar" "${TMP_PATH}/archea"
+            downloadFile "http://eggnogdb.embl.de/download/eggnog_5.0/per_tax_level/2759/2759_raw_algs.tar" "${TMP_PATH}/eukaryota"
+            downloadFile "http://eggnogdb.embl.de/download/eggnog_5.0/per_tax_level/10239/10239_raw_algs.tar" "${TMP_PATH}/viruses"
+            touch "${TMP_PATH}/download.done"
+        fi
+        INPUT_TYPE="eggNOG"
+    ;;
+    "Resfinder")
+        if notExists "${TMP_PATH}/download.done"; then
+            downloadFile "https://api.bitbucket.org/2.0/repositories/genomicepidemiology/resfinder_db/commit/master?fields=hash,date" "${TMP_PATH}/version"
+            downloadFile "https://bitbucket.org/genomicepidemiology/resfinder_db/get/master.tar.gz" "${TMP_PATH}/master.tar.gz"
+            tar -C "${TMP_PATH}" --strip-components=1 -xzvf "${TMP_PATH}/master.tar.gz" "*.fsa"
+            rm -f "${TMP_PATH}/master.tar.gz"
+            touch "${TMP_PATH}/download.done"
+        fi
+        INPUT_TYPE="FSA"
+    ;;
+    "Kalamari")
+        if notExists "${TMP_PATH}/kalamari.tsv"; then
+            printf "3.7 %s\n" "$(date "+%s")" > "${TMP_PATH}/version"
+            downloadFile "https://raw.githubusercontent.com/lskatz/Kalamari/master/src/Kalamari_v3.7.tsv" "${TMP_PATH}/kalamari.tsv"
+        fi
+        ACCESSIONS=""
+        # shellcheck disable=SC2034
+        while IFS="$(printf '\t')" read -r NAME ACCESSION TAXID; do
+            if [ "$NAME" = "scientificName" ]; then
+                continue
+            fi
+            case "${ACCESSION}" in XXX*)
+                continue
+            esac
+            if [ -z "$ACCESSIONS" ]; then
+                ACCESSIONS="$ACCESSION"
+            else
+                ACCESSIONS="$ACCESSIONS,$ACCESSION"
+            fi
+        done < "${TMP_PATH}/kalamari.tsv"
+        if notExists "${TMP_PATH}/kalamari.fasta"; then
+            # Reset download strategy to not use aria2c for NCBI
+            STRATEGY=""
+            if hasCommand curl; then STRATEGY="$STRATEGY CURL"; fi
+            if hasCommand wget; then STRATEGY="$STRATEGY WGET"; fi
+            if [ "$STRATEGY" = "" ]; then
+                fail "No download tool found in PATH. Please install curl or wget."  # aria2c is deliberately not used for NCBI here
+            fi
+            downloadFile "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=${ACCESSIONS}&rettype=fasta&retmode=txt" "${TMP_PATH}/kalamari.fasta.tmp"
+            awk '/<!DOCTYPE/ { exit 1; } ' "${TMP_PATH}/kalamari.fasta.tmp" || fail "Could not download genomes from NCBI. Please try again later."
+            awk -F '[\t>.]' 'NR == FNR { f[$2]=$NF; next; } /^>/{ print $0" TaxID="f[$2]" "; next; } { print; }' "${TMP_PATH}/kalamari.tsv" "${TMP_PATH}/kalamari.fasta.tmp" > "${TMP_PATH}/kalamari.fasta"
+            rm -f "${TMP_PATH}/kalamari.fasta.tmp"
+        fi
+        push_back "${TMP_PATH}/kalamari.fasta"
+        INPUT_TYPE="FASTA_LIST"
+    ;;
+esac
+
+if notExists "${OUTDB}.dbtype"; then
+case "${INPUT_TYPE}" in
+    "FASTA_LIST")
+        eval "set -- $ARR"
+        # shellcheck disable=SC2086
+        "${MMSEQS}" createdb "${@}" "${OUTDB}" ${COMP_PAR} \
+            || fail "createdb died"
+        for i in "${@}"; do
+            rm -f -- "$i"
+        done
+    ;;
+    "FSA")
+        # shellcheck disable=SC2086
+        "${MMSEQS}" createdb "${TMP_PATH}/"*.fsa "${OUTDB}" ${COMP_PAR} \
+            || fail "createdb died"
+        rm -f -- "${TMP_PATH}/"*.fsa
+    ;;
+    "A3M")
+        # shellcheck disable=SC2086
+        "${MMSEQS}" msa2profile "${TMP_PATH}/msa" "${OUTDB}" --match-mode 1 --match-ratio 0.5 --msa-type 1 ${THREADS_PAR} \
+            || fail "msa2profile died"
+    ;;
+    "MSA")
+        # shellcheck disable=SC2086
+        "${MMSEQS}" convertmsa "${TMP_PATH}/db.msa.gz" "${TMP_PATH}/msa" ${VERB_PAR} \
+            || fail "convertmsa died"
+        rm -f "${TMP_PATH}/db.msa.gz"
+        # shellcheck disable=SC2086
+        "${MMSEQS}" msa2profile "${TMP_PATH}/msa" "${OUTDB}" --match-mode 1 --match-ratio 0.5 ${THREADS_PAR} \
+            || fail "msa2profile died"
+        "${MMSEQS}" rmdb "${TMP_PATH}/msa" \
+            || fail "rmdb died"
+    ;;
+    "eggNOG")
+        # shellcheck disable=SC2086
+        "${MMSEQS}" tar2db "${TMP_PATH}/bacteria" "${TMP_PATH}/archea" "${TMP_PATH}/eukaryota" "${TMP_PATH}/viruses" "${TMP_PATH}/msa" --output-dbtype 11 --tar-include '\.raw_alg\.faa\.gz$' ${COMP_PAR} \
+            || fail "tar2db died"
+        sed 's|\.raw_alg\.faa\.gz||g' "${TMP_PATH}/msa.lookup" > "${TMP_PATH}/msa.lookup.tmp"
+        mv -f "${TMP_PATH}/msa.lookup.tmp" "${TMP_PATH}/msa.lookup"
+        # shellcheck disable=SC2086
+        "${MMSEQS}" msa2profile "${TMP_PATH}/msa" "${OUTDB}" --match-mode 1 --match-ratio 0.5 ${THREADS_PAR} \
+            || fail "msa2profile died"
+        rm -f "${TMP_PATH}/bacteria" "${TMP_PATH}/archea" "${TMP_PATH}/eukaryota" "${TMP_PATH}/viruses"  # archives are saved without a .tar suffix; dropped only once the profile DB is built so a failed run can resume
+        mv -f "${TMP_PATH}/msa.lookup" "${OUTDB}.lookup"
+        mv -f "${TMP_PATH}/msa.source" "${OUTDB}.source"
+        "${MMSEQS}" rmdb "${TMP_PATH}/msa" \
+            || fail "rmdb died"
+    ;;
+esac
+fi
+
+if [ -n "${TAXONOMY}" ] && notExists "${OUTDB}_mapping"; then
+    # shellcheck disable=SC2086
+    "${MMSEQS}" prefixid "${OUTDB}_h" "${TMP_PATH}/header_pref.tsv" --tsv ${THREADS_PAR} \
+        || fail "prefixid died"
+    awk '{ match($0, / OX=[0-9]+ /); if (RLENGTH != -1) { print $1"\t"substr($0, RSTART+4, RLENGTH-5); next; } match($0, / TaxID=[0-9]+ /); print $1"\t"substr($0, RSTART+7, RLENGTH-8); }' "${TMP_PATH}/header_pref.tsv" \
+        | LC_ALL=C sort -n > "${OUTDB}_mapping"
+    rm -f "${TMP_PATH}/header_pref.tsv"
+    # shellcheck disable=SC2086
+    "${MMSEQS}" createtaxdb "${OUTDB}" "${TMP_PATH}/taxonomy" ${THREADS_PAR} \
+        || fail "createtaxdb died"
+fi
+
+if notExists "${OUTDB}.version"; then
+    mv -f "${TMP_PATH}/version" "${OUTDB}.version"
+fi
+
+if [ -n "${REMOVE_TMP}" ]; then
+    rm -f "${TMP_PATH}/download.sh"
+fi
diff --git a/data/easycluster.sh b/data/workflow/easycluster.sh
similarity index 81%
rename from data/easycluster.sh
rename to data/workflow/easycluster.sh
index 2ea55c4..2852d37 100755
--- a/data/easycluster.sh
+++ b/data/workflow/easycluster.sh
@@ -52,12 +52,16 @@ mv "${TMP_PATH}/rep_seq.fasta"  "${RESULTS}_rep_seq.fasta"
 mv "${TMP_PATH}/cluster.tsv"  "${RESULTS}_cluster.tsv"
 
 if [ -n "${REMOVE_TMP}" ]; then
-    echo "Removing temporary files"
-    "$MMSEQS" rmdb "${TMP_PATH}/input"
-    "$MMSEQS" rmdb "${TMP_PATH}/input_h"
-    "$MMSEQS" rmdb "${TMP_PATH}/clu_seqs"
-    "$MMSEQS" rmdb "${TMP_PATH}/clu_rep"
-    "$MMSEQS" rmdb "${TMP_PATH}/clu"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu_seqs" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu_rep" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR}
     rm -rf "${TMP_PATH}/clu_tmp"
     rm -f "${TMP_PATH}/easycluster.sh"
 fi
diff --git a/data/workflow/easyrbh.sh b/data/workflow/easyrbh.sh
new file mode 100755
index 0000000..016350c
--- /dev/null
+++ b/data/workflow/easyrbh.sh
@@ -0,0 +1,56 @@
+#!/bin/sh -e
+fail() {
+    echo "Error: $1"
+    exit 1
+}
+
+notExists() {
+	[ ! -f "$1" ]
+}
+
+if notExists "${TMP_PATH}/query.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createdb "${QUERY}" "${TMP_PATH}/query" ${CREATEDB_QUERY_PAR} \
+        || fail "query createdb died"
+fi
+QUERY="${TMP_PATH}/query"  # always use the converted DB, also when resuming with an already existing one
+
+if notExists "${TARGET}.dbtype"; then
+    if notExists "${TMP_PATH}/target"; then
+        # shellcheck disable=SC2086
+        "$MMSEQS" createdb "${TARGET}" "${TMP_PATH}/target" ${CREATEDB_PAR} \
+            || fail "target createdb died"
+    fi
+    TARGET="${TMP_PATH}/target"
+fi
+
+if notExists "${TMP_PATH}/result.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" rbh "${QUERY}" "${TARGET}" "${TMP_PATH}/result" "${TMP_PATH}/rbh_tmp" ${SEARCH_PAR} \
+        || fail "Search died"
+fi
+
+if notExists "${TMP_PATH}/alis.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" convertalis "${QUERY}" "${TARGET}" "${TMP_PATH}/result" "${RESULTS}" ${CONVERT_PAR} \
+        || fail "Convert Alignments died"
+fi
+
+if [ -n "${REMOVE_TMP}" ]; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY}
+    if [ -z "${LEAVE_INPUT}" ]; then
+        if [ -f "${TMP_PATH}/target" ]; then
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/target" ${VERBOSITY}
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/target_h" ${VERBOSITY}
+        fi
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/query" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/query_h" ${VERBOSITY}
+    fi
+    rm -rf "${TMP_PATH}/rbh_tmp"
+    rm -f "${TMP_PATH}/easyrbh.sh"
+fi
diff --git a/data/easysearch.sh b/data/workflow/easysearch.sh
similarity index 76%
rename from data/easysearch.sh
rename to data/workflow/easysearch.sh
index bd922d7..9346f5e 100755
--- a/data/easysearch.sh
+++ b/data/workflow/easysearch.sh
@@ -57,18 +57,23 @@ fi
 
 
 if [ -n "${REMOVE_TMP}" ]; then
-    echo "Removing temporary files"
     if [ -n "${GREEDY_BEST_HITS}" ]; then
-        "$MMSEQS" rmdb "${TMP_PATH}/result_best"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/result_best" ${VERBOSITY}
     fi
-    "$MMSEQS" rmdb "${TMP_PATH}/result"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY}
     if [ -z "${LEAVE_INPUT}" ]; then
         if [ -f "${TMP_PATH}/target" ]; then
-            "$MMSEQS" rmdb "${TMP_PATH}/target"
-            "$MMSEQS" rmdb "${TMP_PATH}/target_h"
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/target" ${VERBOSITY}
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/target_h" ${VERBOSITY}
         fi
-        "$MMSEQS" rmdb "${TMP_PATH}/query"
-        "$MMSEQS" rmdb "${TMP_PATH}/query_h"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/query" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/query_h" ${VERBOSITY}
     fi
     rm -rf "${TMP_PATH}/search_tmp"
     rm -f "${TMP_PATH}/easysearch.sh"
diff --git a/data/easytaxonomy.sh b/data/workflow/easytaxonomy.sh
similarity index 82%
rename from data/easytaxonomy.sh
rename to data/workflow/easytaxonomy.sh
index d7bd258..6c1021b 100755
--- a/data/easytaxonomy.sh
+++ b/data/workflow/easytaxonomy.sh
@@ -75,16 +75,22 @@ if notExists "${RESULTS}_tophit_aln"; then
 fi
 
 if [ -n "${REMOVE_TMP}" ]; then
-    echo "Removing temporary files"
-    "$MMSEQS" rmdb "${TMP_PATH}/result"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY}
     if [ -z "${LEAVE_INPUT}" ]; then
-        "$MMSEQS" rmdb "${TMP_PATH}/query"
-        "$MMSEQS" rmdb "${TMP_PATH}/query_h"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/query" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/query_h" ${VERBOSITY}
     fi
-    "$MMSEQS" rmdb "${TMP_PATH}/result_top1"
-    "$MMSEQS" rmdb "${TMP_PATH}/result_top1_swapped"
-    "$MMSEQS" rmdb "${TMP_PATH}/result_top1_swapped_sum"
-    "$MMSEQS" rmdb "${TMP_PATH}/result_top1_swapped_sum_tax"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/result_top1" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/result_top1_swapped" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/result_top1_swapped_sum" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/result_top1_swapped_sum_tax" ${VERBOSITY}
 
     rm -rf "${TMP_PATH}/taxonomy_tmp"
     rm -f "${TMP_PATH}/easytaxonomy.sh"
diff --git a/data/enrich.sh b/data/workflow/enrich.sh
similarity index 99%
rename from data/enrich.sh
rename to data/workflow/enrich.sh
index bd856dd..1dd0597 100755
--- a/data/enrich.sh
+++ b/data/workflow/enrich.sh
@@ -118,7 +118,6 @@ mv -f "${TMP_PATH}/aln_0.index" "${RESULT}.index"
 mv -f "${TMP_PATH}/aln_0.dbtype" "${RESULT}.dbtype"
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
     STEP=0
     while [ "${STEP}" -lt "${NUM_IT}" ]; do
         rm -f "${TMP_PATH}/pref_${STEP}" "${TMP_PATH}/pref_${STEP}.index" "${TMP_PATH}/pref_${STEP}.dbtype"
diff --git a/data/linclust.sh b/data/workflow/linclust.sh
similarity index 77%
rename from data/linclust.sh
rename to data/workflow/linclust.sh
index 321e807..babda71 100755
--- a/data/linclust.sh
+++ b/data/workflow/linclust.sh
@@ -50,7 +50,8 @@ if notExists "${TMP_PATH}/pref_filter1"; then
 fi
 
 if notExists "${TMP_PATH}/pref_filter2"; then
-    "$MMSEQS" filterdb "${TMP_PATH}/pref_filter1" "${TMP_PATH}/pref_filter2" --filter-file "${TMP_PATH}/order_redundancy" \
+    # shellcheck disable=SC2086
+    "$MMSEQS" filterdb "${TMP_PATH}/pref_filter1" "${TMP_PATH}/pref_filter2" --filter-file "${TMP_PATH}/order_redundancy" ${VERBOSITYANDCOMPRESS} \
         || fail "Filterdb step died"
 fi
 
@@ -88,23 +89,30 @@ if notExists "${TMP_PATH}/clu"; then
 fi
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
-    "$MMSEQS" rmdb "${TMP_PATH}/pref"
-    "$MMSEQS" rmdb "${TMP_PATH}/pref_rescore1"
-    "$MMSEQS" rmdb "${TMP_PATH}/pre_clust"
-    "$MMSEQS" rmdb "${TMP_PATH}/input_step_redundancy"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/pref" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/pref_rescore1" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/pre_clust" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input_step_redundancy" ${VERBOSITY}
     rm -f "${TMP_PATH}/order_redundancy"
 
-    "$MMSEQS" rmdb "${TMP_PATH}/pref_filter1"
-    "$MMSEQS" rmdb "${TMP_PATH}/pref_filter2"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/pref_filter1" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/pref_filter2" ${VERBOSITY}
 
     if [ -n "${ALIGN_GAPPED}" ]; then
         if [ -n "$FILTER" ]; then
-            "$MMSEQS" rmdb "${TMP_PATH}/pref_rescore2"
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/pref_rescore2" ${VERBOSITY}
         fi
-        "$MMSEQS" rmdb "${TMP_PATH}/aln"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY}
     fi
-    "$MMSEQS" rmdb "${TMP_PATH}/clust"
-
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clust" ${VERBOSITY}
     rm -f "${TMP_PATH}/linclust.sh"
 fi
diff --git a/data/linsearch.sh b/data/workflow/linsearch.sh
similarity index 94%
rename from data/linsearch.sh
rename to data/workflow/linsearch.sh
index 32a46cc..61d014e 100755
--- a/data/linsearch.sh
+++ b/data/workflow/linsearch.sh
@@ -95,8 +95,9 @@ fi
 
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
-    "$MMSEQS" rmdb "${TMP_PATH}/pref"
-    "$MMSEQS" rmdb "${TMP_PATH}/reverse_aln"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/pref" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/reverse_aln" ${VERBOSITY}
     rm -f "${TMP_PATH}/linsearch.sh"
 fi
diff --git a/data/map.sh b/data/workflow/map.sh
similarity index 100%
rename from data/map.sh
rename to data/workflow/map.sh
diff --git a/data/multihitdb.sh b/data/workflow/multihitdb.sh
similarity index 99%
rename from data/multihitdb.sh
rename to data/workflow/multihitdb.sh
index e2a5d04..407acd0 100755
--- a/data/multihitdb.sh
+++ b/data/workflow/multihitdb.sh
@@ -85,7 +85,6 @@ else
 fi
 
 if [ -n "${REMOVE_TMP}" ]; then
-    echo "Remove temporary files"
     rmdir "${TMP_PATH}/search"
     if [ -n "${NUCL}" ]; then
         rm -f "${TMP_PATH}/nucl_set.tsv"
diff --git a/data/multihitsearch.sh b/data/workflow/multihitsearch.sh
similarity index 89%
rename from data/multihitsearch.sh
rename to data/workflow/multihitsearch.sh
index 9fbaeb4..6675606 100644
--- a/data/multihitsearch.sh
+++ b/data/workflow/multihitsearch.sh
@@ -42,10 +42,11 @@ if notExists "${OUTPUT}.index"; then
 fi
 
 if [ -n "${REMOVE_TMP}" ]; then
-    echo "Remove temporary files"
     rmdir "${TMP_PATH}/search"
-    "$MMSEQS" rmdb "${TMP_PATH}/result"
-    "$MMSEQS" rmdb "${TMP_PATH}/aggregate"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/result" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aggregate" ${VERBOSITY}
     rm -f "${TMP_PATH}/multihitsearch.sh"
 fi
 
diff --git a/data/workflow/nucleotide_clustering.sh b/data/workflow/nucleotide_clustering.sh
new file mode 100755
index 0000000..c77cda8
--- /dev/null
+++ b/data/workflow/nucleotide_clustering.sh
@@ -0,0 +1,125 @@
+#!/bin/sh -e
+fail() { # print "Error: <first argument>" on stdout, then terminate the script with status 1
+    echo "Error: $1"
+    exit 1
+}
+
+notExists() { # succeeds (status 0) when the path in $1 is not an existing regular file
+	[ ! -f "$1" ]
+}
+
+# check number of input variables
+[ "$#" -ne 3 ] && echo "Please provide <sequenceDB> <outDB> <tmp>" && exit 1;
+# check if files exist
+[ ! -f "$1.dbtype" ] && echo "$1.dbtype not found!" && exit 1;
+[   -f "$2.dbtype" ] && echo "$2.dbtype exists already!" && exit 1;
+[ ! -d "$3" ] && echo "tmp directory $3 not found!" && mkdir -p "$3";
+
+INPUT="$1"
+TMP_PATH="$3"
+ORIGINAL="$INPUT"
+
+mkdir -p "${TMP_PATH}/linclust"
+if notExists "${TMP_PATH}/clu_redundancy.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" linclust "$INPUT" "${TMP_PATH}/clu_redundancy" "${TMP_PATH}/linclust" ${LINCLUST_PAR} \
+        || fail "linclust died"
+fi
+
+if notExists "${TMP_PATH}/input_step_redundancy.dbtype"; then # skipped when an earlier run already wrote this DB
+    # shellcheck disable=SC2086
+    "$MMSEQS" createsubdb "${TMP_PATH}/clu_redundancy" "$INPUT" "${TMP_PATH}/input_step_redundancy" ${VERBOSITY} --subdb-mode 1 \
+        || fail "createsubdb died"
+fi
+
+INPUT="${TMP_PATH}/input_step_redundancy"
+
+if notExists "$TMP_PATH/query_seqs.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" extractframes "$INPUT" "${TMP_PATH}/query_seqs" ${EXTRACT_FRAMES_PAR}  \
+        || fail "Extractframes died"
+fi
+QUERY="$TMP_PATH/query_seqs"
+
+if notExists "${TMP_PATH}/pref.dbtype"; then
+    # shellcheck disable=SC2086
+    $RUNNER "$MMSEQS" prefilter "$QUERY" "$INPUT" "${TMP_PATH}/pref" ${PREFILTER_PAR} \
+        || fail "Prefilter step died"
+fi
+
+
+if [ -n "$ALIGNMENT_MODE_NOT_SET" ]; then
+
+    if notExists "${TMP_PATH}/aln_ungapped.dbtype"; then # guard on the DB this step actually writes
+        # shellcheck disable=SC2086
+        $RUNNER "$MMSEQS" rescorediagonal "$QUERY" "$INPUT" "${TMP_PATH}/pref" "${TMP_PATH}/aln_ungapped" ${RESCORE_ALN_PAR}  \
+             || fail "Alignment step died"
+    fi
+
+    if notExists "${TMP_PATH}/pref_subtract.dbtype"; then # skipped when an earlier run already wrote this DB
+        # shellcheck disable=SC2086
+        $RUNNER "$MMSEQS" subtractdbs "${TMP_PATH}/pref" "${TMP_PATH}/aln_ungapped" "${TMP_PATH}/pref_subtract" ${THREADSANDCOMPRESS_PAR}  \
+             || fail "Subtractdbs step died"
+    fi
+
+    if notExists "${TMP_PATH}/aln_gapped.dbtype"; then
+        # shellcheck disable=SC2086
+        $RUNNER "$MMSEQS" align "$QUERY" "$INPUT" "${TMP_PATH}/pref_subtract" "${TMP_PATH}/aln_gapped" ${ALIGNMENT_PAR}  \
+             || fail "Alignment step died"
+    fi
+
+    if notExists "${TMP_PATH}/aln.dbtype"; then # skipped when an earlier run already wrote this DB
+        # shellcheck disable=SC2086
+        "$MMSEQS" concatdbs "${TMP_PATH}/aln_ungapped" "${TMP_PATH}/aln_gapped" "${TMP_PATH}/aln" --preserve-keys --take-larger-entry ${THREADSANDCOMPRESS_PAR} \
+             || fail "Concatdbs died"
+    fi
+
+else
+    if notExists "${TMP_PATH}/aln.dbtype"; then
+        # shellcheck disable=SC2086
+        $RUNNER "$MMSEQS" "${ALIGN_MODULE}" "$QUERY" "$INPUT" "${TMP_PATH}/pref" "${TMP_PATH}/aln" ${ALIGNMENT_PAR}  \
+             || fail "Alignment step died"
+    fi
+
+fi
+
+
+
+if notExists "${TMP_PATH}/aln_off.dbtype"; then # same .dbtype completion marker the other steps of this script test
+    # shellcheck disable=SC2086
+    "$MMSEQS" offsetalignment "${TMP_PATH}/input_step_redundancy" "${QUERY}" \
+                              "${TMP_PATH}/input_step_redundancy" "${TMP_PATH}/input_step_redundancy" \
+                              "${TMP_PATH}/aln" "${TMP_PATH}/aln_off" ${OFFSETALIGNMENT_PAR} \
+        || fail "Offset step died"
+fi
+
+if notExists "${TMP_PATH}/clu.dbtype"; then
+     # shellcheck disable=SC2086
+     "$MMSEQS" clust "$INPUT" "${TMP_PATH}/aln_off" "${TMP_PATH}/clu" ${CLUSTER_PAR} \
+          || fail "Clustering step died"
+fi
+
+# merge clu_redundancy and clu
+# shellcheck disable=SC2086
+"$MMSEQS" mergeclusters "$ORIGINAL" "$2" "${TMP_PATH}/clu_redundancy" "${TMP_PATH}/clu" ${MERGECLU_PAR} \
+        || fail "Merging of clusters has died"
+
+if [ -n "$REMOVE_TMP" ]; then # optional cleanup of the intermediate databases written above
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/query_seqs" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu_redundancy" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input_step_redundancy" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/pref" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln_off" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY}
+
+    rm -f "${TMP_PATH}/nucleotide_clustering.sh"
+fi
+
diff --git a/data/rbh.sh b/data/workflow/rbh.sh
similarity index 84%
rename from data/rbh.sh
rename to data/workflow/rbh.sh
index e0d0c9d..8b4e224 100755
--- a/data/rbh.sh
+++ b/data/workflow/rbh.sh
@@ -60,7 +60,7 @@ fi
 # swap the direction of resB_best_A:
 if [ ! -e "${TMP_PATH}/resB_best_A_swap.dbtype" ]; then
     # shellcheck disable=SC2086
-    "$MMSEQS" swapdb "${TMP_PATH}/resB_best_A" "${TMP_PATH}/resB_best_A_swap" ${THREADS_COMP_PAR} \
+    "$MMSEQS" swapresults "${B_DB}" "${A_DB}" "${TMP_PATH}/resB_best_A" "${TMP_PATH}/resB_best_A_swap" ${THREADS_COMP_PAR} -e 100000000 \
         || fail "swap B best A died"
 fi
 
@@ -86,13 +86,17 @@ if [ ! -e "${RBH_RES}.dbtype" ]; then
 fi
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
     rm -rf "${TMP_PATH}/tempAB"
     rm -rf "${TMP_PATH}/tempBA"
-    "$MMSEQS" rmdb "${TMP_PATH}/resAB"
-    "$MMSEQS" rmdb "${TMP_PATH}/resBA"
-    "$MMSEQS" rmdb "${TMP_PATH}/resA_best_B"
-    "$MMSEQS" rmdb "${TMP_PATH}/resB_best_A"
-    "$MMSEQS" rmdb "${TMP_PATH}/res_best"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/resAB" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/resBA" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/resA_best_B" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/resB_best_A" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/res_best" ${VERBOSITY}
     rm -f "${TMP_PATH}/rbh.sh"
 fi
diff --git a/data/searchslicedtargetprofile.sh b/data/workflow/searchslicedtargetprofile.sh
similarity index 91%
rename from data/searchslicedtargetprofile.sh
rename to data/workflow/searchslicedtargetprofile.sh
index 39e286e..75335f2 100755
--- a/data/searchslicedtargetprofile.sh
+++ b/data/workflow/searchslicedtargetprofile.sh
@@ -79,15 +79,16 @@ while [ "${FIRST_INDEX_LINE}" -le "${TOTAL_NUM_PROFILES}" ]; do
             NUM_SEQS_THAT_SATURATE=1
         fi
     fi
-
+    # prefilter res size (10 + 1 + 6 + 1 + 3 + 1) + 3 byte buffer
+    RESSIZE=25
     # disk usage allowance not set by the user (i.e. AVAIL_DISK = 0), compute it for optimal usage
     if [ "${AVAIL_DISK}" -eq 0 ]; then
         CURRENT_AVAIL_DISK_SPACE=$(($("$MMSEQS" diskspaceavail "${TMP_PATH}")/2))
         # Compute the max number of profiles that can be processed
         # based on the number of hits that saturate
-        NUM_PROFS_IN_STEP="$((CURRENT_AVAIL_DISK_SPACE/NUM_SEQS_THAT_SATURATE/25))"
+        NUM_PROFS_IN_STEP="$((CURRENT_AVAIL_DISK_SPACE/NUM_SEQS_THAT_SATURATE/RESSIZE))"
     else
-        NUM_PROFS_IN_STEP="$((AVAIL_DISK/NUM_SEQS_THAT_SATURATE/25))"
+        NUM_PROFS_IN_STEP="$((AVAIL_DISK/NUM_SEQS_THAT_SATURATE/RESSIZE))"
     fi
 
     # no matter what, process at least one profile...
@@ -128,7 +129,7 @@ while [ "${FIRST_INDEX_LINE}" -le "${TOTAL_NUM_PROFILES}" ]; do
         ${RUNNER} "$MMSEQS" align "${PROFILEDB}" "${INPUT}" "${TMP_PATH}/pref" "${TMP_PATH}/aln" ${ALIGNMENT_PAR} \
             || fail "align died"
         # shellcheck disable=SC2086
-        "$MMSEQS" rmdb "${TMP_PATH}/pref" ${VERBOSITY_PAR}
+        "$MMSEQS" rmdb "${TMP_PATH}/pref" ${VERBOSITY}
         touch "${TMP_PATH}/aln.done"
     fi
 
@@ -139,25 +140,25 @@ while [ "${FIRST_INDEX_LINE}" -le "${TOTAL_NUM_PROFILES}" ]; do
         "$MMSEQS" swapresults "${TARGET}" "${INPUT}" "${TMP_PATH}/aln" "${TMP_PATH}/aln_swap" ${SWAP_PAR} \
             || fail "swapresults died"
         # shellcheck disable=SC2086
-        "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR}
+        "$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY}
         touch "${TMP_PATH}/aln_swap.done"
     fi
 
     # merge swapped alignment of current chunk to previous steps
     if [ -f "${TMP_PATH}/aln_merged.dbtype" ]; then
         # shellcheck disable=SC2086
-        "$MMSEQS" mergedbs "${INPUT}" "${TMP_PATH}/aln_merged_new" "${TMP_PATH}/aln_merged" "${TMP_PATH}/aln_swap" ${VERBOSITY_PAR} \
+        "$MMSEQS" mergedbs "${INPUT}" "${TMP_PATH}/aln_merged_new" "${TMP_PATH}/aln_merged" "${TMP_PATH}/aln_swap" ${VERBOSITY} \
             || fail "mergedbs died"
         # rmdb of aln_merged to avoid conflict with unmerged dbs: aln_merged.0, .1...
         # shellcheck disable=SC2086
-        "$MMSEQS" rmdb "${TMP_PATH}/aln_merged" ${VERBOSITY_PAR} || fail "rmdb aln_merged died"
+        "$MMSEQS" rmdb "${TMP_PATH}/aln_merged" ${VERBOSITY} || fail "rmdb aln_merged died"
         # shellcheck disable=SC2086
-        "$MMSEQS" mvdb "${TMP_PATH}/aln_merged_new" "${TMP_PATH}/aln_merged" ${VERBOSITY_PAR} || fail "mv aln_merged_new aln_merged died"
+        "$MMSEQS" mvdb "${TMP_PATH}/aln_merged_new" "${TMP_PATH}/aln_merged" ${VERBOSITY} || fail "mv aln_merged_new aln_merged died"
         # shellcheck disable=SC2086
-        "$MMSEQS" rmdb "${TMP_PATH}/aln_swap" ${VERBOSITY_PAR} || fail "rmdb aln_swap died"
+        "$MMSEQS" rmdb "${TMP_PATH}/aln_swap" ${VERBOSITY} || fail "rmdb aln_swap died"
     else
         # shellcheck disable=SC2086
-        "$MMSEQS" mvdb "${TMP_PATH}/aln_swap" "${TMP_PATH}/aln_merged" ${VERBOSITY_PAR} \
+        "$MMSEQS" mvdb "${TMP_PATH}/aln_swap" "${TMP_PATH}/aln_merged" ${VERBOSITY} \
             || fail "mvdb died"
     fi
 
@@ -175,11 +176,10 @@ done
 
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
     # shellcheck disable=SC2086
-    "$MMSEQS" rmdb "${TMP_PATH}/aln_merged" ${VERBOSITY_PAR}
+    "$MMSEQS" rmdb "${TMP_PATH}/aln_merged" ${VERBOSITY}
     # shellcheck disable=SC2086
-    "$MMSEQS" rmdb "${PROFILEDB}" ${VERBOSITY_PAR}
+    "$MMSEQS" rmdb "${PROFILEDB}" ${VERBOSITY}
     CURR_STEP=0
     while [ "${CURR_STEP}" -le "${STEP}" ]; do
         if [ -f "${TMP_PATH}/aln_${CURR_STEP}.checkpoint" ]; then
diff --git a/data/searchtargetprofile.sh b/data/workflow/searchtargetprofile.sh
similarity index 85%
rename from data/searchtargetprofile.sh
rename to data/workflow/searchtargetprofile.sh
index 64f86ad..5c52b95 100755
--- a/data/searchtargetprofile.sh
+++ b/data/workflow/searchtargetprofile.sh
@@ -48,9 +48,11 @@ fi
 
 
 if [ -n "${REMOVE_TMP}" ]; then
-    echo "Remove temporary files"
-    "$MMSEQS" rmdb "${TMP_PATH}/pref"
-    "$MMSEQS" rmdb "${TMP_PATH}/pref_swapped"
-    "$MMSEQS" rmdb "${TMP_PATH}/aln_swapped"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/pref" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/pref_swapped" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln_swapped" ${VERBOSITY}
     rm -f "${TMP_PATH}/searchtargetprofile.sh"
 fi
diff --git a/data/taxonomy.sh b/data/workflow/taxonomy.sh
similarity index 81%
rename from data/taxonomy.sh
rename to data/workflow/taxonomy.sh
index 9f1a421..6d2d79e 100755
--- a/data/taxonomy.sh
+++ b/data/workflow/taxonomy.sh
@@ -100,27 +100,36 @@ fi
 
 
 if [ -n "${REMOVE_TMP}" ]; then
-    echo "Remove temporary files"
     rm -rf "${TMP_PATH}/tmp_hsp1"
     rm -rf "${TMP_PATH}/tmp_hsp2"
 
-    "$MMSEQS" rmdb "${TMP_PATH}/first"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/first" ${VERBOSITY}
 
     if [ -n "${SEARCH2_PAR}" ]; then
-        "$MMSEQS" rmdb "${TMP_PATH}/top1"
-        "$MMSEQS" rmdb "${TMP_PATH}/aligned"
-        "$MMSEQS" rmdb "${TMP_PATH}/round2"
-        "$MMSEQS" rmdb "${TMP_PATH}/merged"
-        "$MMSEQS" rmdb "${TMP_PATH}/2b_ali"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/top1" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/aligned" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/round2" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/merged" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/2b_ali" ${VERBOSITY}
         if [ -n "${APPROX_2BLCA}" ]; then
-            "$MMSEQS" rmdb "${TMP_PATH}/first_sub"
+            # shellcheck disable=SC2086
+            "$MMSEQS" rmdb "${TMP_PATH}/first_sub" ${VERBOSITY}
         fi
     fi
     if [ -n "${LCA_PAR}" ]; then
-        "$MMSEQS" rmdb "${TMP_PATH}/mapping"
-        "$MMSEQS" rmdb "${TMP_PATH}/taxa"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/mapping" ${VERBOSITY}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/taxa" ${VERBOSITY}
     else
-        "$MMSEQS" rmdb "${TMP_PATH}/mapping"
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/mapping" ${VERBOSITY}
     fi
 
     rm -f "${TMP_PATH}/taxonomy.sh"
diff --git a/data/translated_search.sh b/data/workflow/translated_search.sh
similarity index 91%
rename from data/translated_search.sh
rename to data/workflow/translated_search.sh
index bea5069..4f22ec6 100755
--- a/data/translated_search.sh
+++ b/data/workflow/translated_search.sh
@@ -57,9 +57,10 @@ if notExists "$3.dbtype"; then
 fi
 
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files"
-    "$MMSEQS" rmdb "$4/q_orfs_aa"
-    "$MMSEQS" rmdb "$4/t_orfs_aa"
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "$4/q_orfs_aa" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "$4/t_orfs_aa" ${VERBOSITY}
 fi
 
 
diff --git a/data/update_clustering.sh b/data/workflow/update_clustering.sh
similarity index 93%
rename from data/update_clustering.sh
rename to data/workflow/update_clustering.sh
index 236a981..998e290 100755
--- a/data/update_clustering.sh
+++ b/data/workflow/update_clustering.sh
@@ -219,7 +219,7 @@ echo "==================================================="
 if [ -f "${TMP_PATH}/newSeqsHits.swapped.dbtype" ]; then
     if notExists "${TMP_PATH}/updatedClust.dbtype"; then
         "$MMSEQS" mergedbs "$OLDCLUST" "${TMP_PATH}/updatedClust" "$OLDCLUST" "${TMP_PATH}/newSeqsHits.swapped" \
-            || fail "Mergeffindex died"
+            || fail "mergedbs died"
     fi
 else
     if notExists "${TMP_PATH}/updatedClust"; then
@@ -281,18 +281,25 @@ fi
 
 debugWait
 if [ -n "$REMOVE_TMP" ]; then
-    echo "Remove temporary files 3/3"
     rm -f "${TMP_PATH}/newSeqs.mapped" "${TMP_PATH}/mappingSeqs.reverse" "${TMP_PATH}/newMappingSeqs"
 	rm -f  "${TMP_PATH}/noHitSeqList" "${TMP_PATH}/mappingSeqs" "${TMP_PATH}/newSeqs" "${TMP_PATH}/removedSeqs"
 
-	"$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits.swapped"
-	"$MMSEQS" rmdb "${TMP_PATH}/newClusters"
-	"$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits"
-	"$MMSEQS" rmdb "${TMP_PATH}/toBeClusteredSeparately"
-	"$MMSEQS" rmdb "${TMP_PATH}/NEWDB.newSeqs"
-    "$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits.swapped.all"
-	"$MMSEQS" rmdb "${TMP_PATH}/OLDDB.repSeq"
-	"$MMSEQS" rmdb "${TMP_PATH}/updatedClust"
+    # shellcheck disable=SC2086
+	"$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits.swapped" ${VERBOSITY}
+    # shellcheck disable=SC2086
+	"$MMSEQS" rmdb "${TMP_PATH}/newClusters" ${VERBOSITY}
+    # shellcheck disable=SC2086
+	"$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits" ${VERBOSITY}
+    # shellcheck disable=SC2086
+	"$MMSEQS" rmdb "${TMP_PATH}/toBeClusteredSeparately" ${VERBOSITY}
+    # shellcheck disable=SC2086
+	"$MMSEQS" rmdb "${TMP_PATH}/NEWDB.newSeqs" ${VERBOSITY}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/newSeqsHits.swapped.all" ${VERBOSITY}
+    # shellcheck disable=SC2086
+	"$MMSEQS" rmdb "${TMP_PATH}/OLDDB.repSeq" ${VERBOSITY}
+    # shellcheck disable=SC2086
+	"$MMSEQS" rmdb "${TMP_PATH}/updatedClust" ${VERBOSITY}
 
 	rmdir "${TMP_PATH}/search" "${TMP_PATH}/cluster"
 
diff --git a/lib/alp/CMakeLists.txt b/lib/alp/CMakeLists.txt
index 6ad41c3..2c77e59 100644
--- a/lib/alp/CMakeLists.txt
+++ b/lib/alp/CMakeLists.txt
@@ -40,4 +40,4 @@ add_library(alp OBJECT
         sls_alp.hpp
         sls_normal_distr_array.hpp
         )
-set_target_properties(alp PROPERTIES COMPILE_FLAGS ${MMSEQS_CXX_FLAGS} -w LINK_FLAGS ${MMSEQS_CXX_FLAGS} -w)
+set_target_properties(alp PROPERTIES COMPILE_FLAGS "${MMSEQS_CXX_FLAGS} -w" LINK_FLAGS "${MMSEQS_CXX_FLAGS} -w")
diff --git a/lib/cacode/CMakeLists.txt b/lib/cacode/CMakeLists.txt
index 3ca6240..9b51cde 100644
--- a/lib/cacode/CMakeLists.txt
+++ b/lib/cacode/CMakeLists.txt
@@ -6,4 +6,4 @@ add_library(cacode OBJECT
         nrutil.cpp
         nrutil.h
         )
-set_target_properties(cacode PROPERTIES COMPILE_FLAGS ${MMSEQS_CXX_FLAGS} LINK_FLAGS ${MMSEQS_CXX_FLAGS})
+set_target_properties(cacode PROPERTIES COMPILE_FLAGS "${MMSEQS_CXX_FLAGS}" LINK_FLAGS "${MMSEQS_CXX_FLAGS}")
diff --git a/lib/ksw2/kseq.h b/lib/ksw2/kseq.h
index 47b354e..458511a 100644
--- a/lib/ksw2/kseq.h
+++ b/lib/ksw2/kseq.h
@@ -189,15 +189,16 @@ typedef struct __kstring_t {
 		ks->newline = 0; \
         if (seq->last_char == 0) { /* then jump to the next header line */ \
 			while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \
-            seq->offset = ks->cur_buf_pos + ks->begin; \
+            seq->headerOffset = ks->cur_buf_pos + ks->begin; \
 			if (c < 0) return c; /* end of file or error*/ \
 			seq->last_char = c; \
 		} else{  /* else: the first header char has been read in the previous call */ \
-            seq->offset = ks->cur_buf_pos + ks->begin; \
+            seq->headerOffset = ks->cur_buf_pos + ks->begin; \
 		} \
 		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
 		if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r;  /* normal exit: EOF or error */ \
 		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
+		seq->sequenceOffset = ks->cur_buf_pos + ks->begin; \
 		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
 			seq->seq.m = 256; \
 			seq->seq.s = (char*)malloc(seq->seq.m); \
@@ -235,7 +236,8 @@ typedef struct __kstring_t {
 	typedef struct {							\
 		kstring_t name, comment, seq, qual;		\
 		int last_char;							\
-		size_t offset;                          \
+		size_t headerOffset;                    \
+		size_t sequenceOffset;                  \
 		bool multiline;                         \
 		kstream_t *f;							\
 	} kseq_t;
diff --git a/lib/ksw2/ksw2_extz2_sse.cpp b/lib/ksw2/ksw2_extz2_sse.cpp
index 7107300..676ed7d 100644
--- a/lib/ksw2/ksw2_extz2_sse.cpp
+++ b/lib/ksw2/ksw2_extz2_sse.cpp
@@ -37,8 +37,20 @@ See: https://github.com/lh3/minimap2
 #define KSW_SSE2_ONLY
 #endif
 
+#ifdef WASM
+#include "sse2wasm.h"
+#define __SSE2__
+#define KSW_SSE2_ONLY
+#endif
+
+#ifdef __ALTIVEC__
+#include "sse2altivec.h"
+#define __SSE2__
+#define KSW_SSE2_ONLY
+#endif
+
 #ifdef __SSE2__
-#ifndef NEON
+#if !defined(NEON) && !defined(WASM) && !defined(__ALTIVEC__)
 #include <emmintrin.h>
 #endif
 
diff --git a/lib/microtar/CMakeLists.txt b/lib/microtar/CMakeLists.txt
new file mode 100644
index 0000000..38313aa
--- /dev/null
+++ b/lib/microtar/CMakeLists.txt
@@ -0,0 +1 @@
+add_library(microtar microtar.h microtar.c)
diff --git a/lib/microtar/LICENSE b/lib/microtar/LICENSE
new file mode 100755
index 0000000..7e3bf17
--- /dev/null
+++ b/lib/microtar/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2017 rxi
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/lib/microtar/README.md b/lib/microtar/README.md
new file mode 100755
index 0000000..42acf49
--- /dev/null
+++ b/lib/microtar/README.md
@@ -0,0 +1,99 @@
+# microtar
+A lightweight tar library written in ANSI C
+
+
+## Basic Usage
+The library consists of `microtar.c` and `microtar.h`. These two files can be
+dropped into an existing project and compiled along with it.
+
+
+#### Reading
+```c
+mtar_t tar;
+mtar_header_t h;
+char *p;
+
+/* Open archive for reading */
+mtar_open(&tar, "test.tar", "r");
+
+/* Print all file names and sizes */
+while ( (mtar_read_header(&tar, &h)) != MTAR_ENULLRECORD ) {
+  printf("%s (%u bytes)\n", h.name, h.size);
+  mtar_next(&tar);
+}
+
+/* Load and print contents of file "test.txt" */
+mtar_find(&tar, "test.txt", &h);
+p = calloc(1, h.size + 1);
+mtar_read_data(&tar, p, h.size);
+printf("%s", p);
+free(p);
+
+/* Close archive */
+mtar_close(&tar);
+```
+
+#### Writing
+```c
+mtar_t tar;
+const char *str1 = "Hello world";
+const char *str2 = "Goodbye world";
+
+/* Open archive for writing */
+mtar_open(&tar, "test.tar", "w");
+
+/* Write strings to files `test1.txt` and `test2.txt` */
+mtar_write_file_header(&tar, "test1.txt", strlen(str1));
+mtar_write_data(&tar, str1, strlen(str1));
+mtar_write_file_header(&tar, "test2.txt", strlen(str2));
+mtar_write_data(&tar, str2, strlen(str2));
+
+/* Finalize -- this needs to be the last thing done before closing */
+mtar_finalize(&tar);
+
+/* Close archive */
+mtar_close(&tar);
+```
+
+
+## Error handling
+All functions which return an `int` will return `MTAR_ESUCCESS` if the operation
+is successful. If an error occurs an error value less-than-zero will be
+returned; this value can be passed to the function `mtar_strerror()` to get its
+corresponding error string.
+
+
+## Wrapping a stream
+If you want to read or write from something other than a file, the `mtar_t`
+struct can be manually initialized with your own callback functions and a
+`stream` pointer.
+
+All callback functions are passed a pointer to the `mtar_t` struct as their
+first argument. They should return `MTAR_ESUCCESS` if the operation succeeds
+without an error, or an integer below zero if an error occurs.
+
+After the `stream` field has been set, all required callbacks have been set, and
+all unused fields have been zeroed, the `mtar_t` struct can be safely used with
+the microtar functions. `mtar_open` *should not* be called if the `mtar_t`
+struct was initialized manually.
+
+#### Reading
+The following callbacks should be set for reading an archive from a stream:
+
+Name    | Arguments                                | Description
+--------|------------------------------------------|---------------------------
+`read`  | `mtar_t *tar, void *data, size_t size`   | Read data from the stream
+`seek`  | `mtar_t *tar, long pos`                  | Set the position indicator
+`close` | `mtar_t *tar`                            | Close the stream
+
+#### Writing
+The following callbacks should be set for writing an archive to a stream:
+
+Name    | Arguments                                      | Description
+--------|------------------------------------------------|---------------------
+`write` | `mtar_t *tar, const void *data, size_t size`   | Write data to the stream
+
+
+## License
+This library is free software; you can redistribute it and/or modify it under
+the terms of the MIT license. See [LICENSE](LICENSE) for details.
diff --git a/lib/microtar/microtar.c b/lib/microtar/microtar.c
new file mode 100755
index 0000000..1810a34
--- /dev/null
+++ b/lib/microtar/microtar.c
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2017 rxi
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "microtar.h"
+
+typedef struct { /* On-disk header record; numeric fields hold octal text (see the "%o" conversions in raw_to_header/header_to_raw). */
+  char name[100];
+  char mode[8];
+  char owner[8];
+  char group[8];
+  char size[12];
+  char mtime[12];
+  char checksum[8];
+  char type;
+  char linkname[100];
+  char _padding[255]; /* the fields above total 257 bytes; this brings the record to 512 */
+} mtar_raw_header_t;
+
+
+static unsigned round_up(unsigned n, unsigned incr) { /* Smallest multiple of incr that is >= n; incr must be non-zero. */
+  return n + (incr - n % incr) % incr;
+}
+
+
+static unsigned checksum(const mtar_raw_header_t* rh) { /* Sum of all header bytes, with the checksum field itself excluded. */
+  unsigned i;
+  unsigned char *p = (unsigned char*) rh;
+  unsigned res = 256; /* 8 * 0x20: stands in for the skipped 8-byte checksum field as if it held spaces */
+  for (i = 0; i < offsetof(mtar_raw_header_t, checksum); i++) { /* bytes before the checksum field */
+    res += p[i];
+  }
+  for (i = offsetof(mtar_raw_header_t, type); i < sizeof(*rh); i++) { /* bytes after it, through the padding */
+    res += p[i];
+  }
+  return res;
+}
+
+
+static int tread(mtar_t *tar, void *data, size_t size) { /* Read through the tar->read callback and track the position. */
+  int err = tar->read(tar, data, size);
+  tar->pos += size; /* advanced by the requested size even when the callback reported an error */
+  return err;
+}
+
+
+static int twrite(mtar_t *tar, const void *data, size_t size) { /* Write through the tar->write callback and track the position. */
+  int err = tar->write(tar, data, size);
+  tar->pos += size; /* advanced by the requested size even when the callback reported an error */
+  return err;
+}
+
+
+static int write_null_bytes(mtar_t *tar, int n) { /* Emit n zero bytes via twrite; n <= 0 writes nothing. */
+  int i, err;
+  char nul = '\0';
+  for (i = 0; i < n; i++) { /* one twrite call per byte */
+    err = twrite(tar, &nul, 1);
+    if (err) { /* stop at the first failure and hand its code to the caller */
+      return err;
+    }
+  }
+  return MTAR_ESUCCESS;
+}
+
+
+static int raw_to_header(mtar_header_t *h, const mtar_raw_header_t *rh) { /* Validate a raw record and decode it into *h; returns MTAR_ESUCCESS, MTAR_ENULLRECORD or MTAR_EBADCHKSUM. */
+  unsigned chksum1, chksum2 = 0; /* 0 never equals a computed checksum (always >= 256), so an unparsable field is rejected */
+
+  /* If the checksum starts with a null byte we assume the record is NULL */
+  if (*rh->checksum == '\0') {
+    return MTAR_ENULLRECORD;
+  }
+
+  /* Build and compare checksum */
+  chksum1 = checksum(rh);
+  sscanf(rh->checksum, "%o", &chksum2); /* leaves chksum2 untouched when the field is not octal text */
+  if (chksum1 != chksum2) {
+    return MTAR_EBADCHKSUM;
+  }
+
+  /* Load raw header into header */
+  sscanf(rh->mode, "%o", &h->mode);
+  sscanf(rh->owner, "%o", &h->owner);
+  sscanf(rh->size, "%o", &h->size);
+  sscanf(rh->mtime, "%o", &h->mtime);
+  h->type = rh->type;
+  strcpy(h->name, rh->name); /* NOTE(review): relies on rh->name being NUL-terminated within its 100 bytes -- confirm */
+  strcpy(h->linkname, rh->linkname); /* NOTE(review): same assumption for linkname -- confirm */
+
+  return MTAR_ESUCCESS;
+}
+
+
+static int header_to_raw(mtar_raw_header_t *rh, const mtar_header_t *h) { /* Encode *h into a raw record; always returns MTAR_ESUCCESS. */
+  unsigned chksum;
+
+  /* Load header into raw header */
+  memset(rh, 0, sizeof(*rh)); /* every byte not written below stays zero */
+  sprintf(rh->mode, "%o", h->mode);
+  sprintf(rh->owner, "%o", h->owner);
+  sprintf(rh->size, "%o", h->size);
+  sprintf(rh->mtime, "%o", h->mtime);
+  rh->type = h->type ? h->type : MTAR_TREG; /* a zero type falls back to MTAR_TREG */
+  strcpy(rh->name, h->name); /* NOTE(review): no length check against the 100-byte field -- confirm callers bound the name */
+  strcpy(rh->linkname, h->linkname);
+
+  /* Calculate and write checksum */
+  chksum = checksum(rh); /* taken after all other fields are filled in */
+  sprintf(rh->checksum, "%06o", chksum); /* zero-padded octal text followed by the NUL that sprintf appends */
+  rh->checksum[7] = ' '; /* last byte of the 8-byte field is set to a space */
+
+  return MTAR_ESUCCESS;
+}
+
+
+const char* mtar_strerror(int err) { /* Map an MTAR_E* code to a string literal describing it. */
+  switch (err) {
+    case MTAR_ESUCCESS     : return "success";
+    case MTAR_EFAILURE     : return "failure";
+    case MTAR_EOPENFAIL    : return "could not open";
+    case MTAR_EREADFAIL    : return "could not read";
+    case MTAR_EWRITEFAIL   : return "could not write";
+    case MTAR_ESEEKFAIL    : return "could not seek";
+    case MTAR_EBADCHKSUM   : return "bad checksum";
+    case MTAR_ENULLRECORD  : return "null record";
+    case MTAR_ENOTFOUND    : return "file not found";
+  }
+  return "unknown error"; /* any code not listed in the switch */
+}
+
+
+static int file_write(mtar_t *tar, const void *data, size_t size) {
+  const size_t written = fwrite(data, 1, size, (FILE*)tar->stream);
+  return (written != size) ? MTAR_EWRITEFAIL : MTAR_ESUCCESS;  /* short write fails */
+}
+
+static int file_read(mtar_t *tar, void *data, size_t size) {
+  const size_t got = fread(data, 1, size, (FILE*)tar->stream);
+  return (got != size) ? MTAR_EREADFAIL : MTAR_ESUCCESS;  /* short read fails */
+}
+
+static int file_seek(mtar_t *tar, long offset) {
+  const int rc = fseek((FILE*)tar->stream, offset, SEEK_SET);  /* absolute offset */
+  return (rc != 0) ? MTAR_ESEEKFAIL : MTAR_ESUCCESS;
+}
+
+/* Close the FILE; a failed fclose() (lost buffered data) is MTAR_EFAILURE. */
+static int file_close(mtar_t *tar) {
+  return (fclose((FILE*)tar->stream) == 0) ? MTAR_ESUCCESS : MTAR_EFAILURE;
+}
+
+
+/* Open `filename` as a tar archive through the stdio callbacks.
+ * `mode` is reduced to "rb", "wb" or "ab" according to the first of 'r',
+ * 'w', 'a' it contains (checked in that order); any other string is passed
+ * to fopen() as given. In read mode the first header is read as a validity
+ * check and the archive is closed again if that fails.
+ * Returns MTAR_ESUCCESS, MTAR_EOPENFAIL, or the header read error. */
+int mtar_open(mtar_t *tar, const char *filename, const char *mode) {
+  mtar_header_t first;
+  int status;
+
+  memset(tar, 0, sizeof(*tar));
+  tar->close = file_close;
+  tar->seek = file_seek;
+  tar->read = file_read;
+  tar->write = file_write;
+  /* Force a binary stdio mode */
+  if (strchr(mode, 'r'))      mode = "rb";
+  else if (strchr(mode, 'w')) mode = "wb";
+  else if (strchr(mode, 'a')) mode = "ab";
+  tar->stream = fopen(filename, mode);
+  if (tar->stream == NULL) {
+    return MTAR_EOPENFAIL;
+  }
+  if (mode[0] != 'r') {
+    return MTAR_ESUCCESS;
+  }
+  status = mtar_read_header(tar, &first);
+  if (status != MTAR_ESUCCESS) {
+    mtar_close(tar);
+  }
+  return status;
+}
+
+
+int mtar_close(mtar_t *tar) {
+  return (*tar->close)(tar);  /* hand over to the backend's close callback */
+}
+
+
+int mtar_seek(mtar_t *tar, long pos) {
+  const int status = tar->seek(tar, pos);  /* backend seek callback */
+  tar->pos = pos;  /* recorded even when the seek reported an error */
+  return status;
+}
+
+
+/* Clear the per-record read state, then seek back to offset 0. */
+int mtar_rewind(mtar_t *tar) {
+  tar->last_header = tar->remaining_data = 0;
+  return mtar_seek(tar, 0L);
+}
+
+
+/* Skip the current record: read its header, then seek past the header block
+ * plus round_up(size, 512) payload bytes. Returns the first error hit. */
+int mtar_next(mtar_t *tar) {
+  mtar_header_t hdr;
+  int span;
+  const int status = mtar_read_header(tar, &hdr);
+  if (status != MTAR_ESUCCESS) {
+    return status;
+  }
+  span = sizeof(mtar_raw_header_t) + round_up(hdr.size, 512);
+  return mtar_seek(tar, tar->pos + span);
+}
+
+
+/* Find the record named `name`, scanning from the start of the archive. On a
+ * match the header is copied to *h (if non-NULL) and MTAR_ESUCCESS returned;
+ * a null record ends the scan with MTAR_ENOTFOUND, any other error as-is. */
+int mtar_find(mtar_t *tar, const char *name, mtar_header_t *h) {
+  mtar_header_t header;
+  int err = mtar_rewind(tar);
+  if (err) {
+    return err;
+  }
+  while ( (err = mtar_read_header(tar, &header)) == MTAR_ESUCCESS ) {
+    if ( !strcmp(header.name, name) ) {
+      if (h) {
+        *h = header;
+      }
+      return MTAR_ESUCCESS;
+    }
+    /* A failed skip would re-read this same record forever, so stop here */
+    err = mtar_next(tar);
+    if (err) {
+      return err;
+    }
+  }
+  return (err == MTAR_ENULLRECORD) ? MTAR_ENOTFOUND : err;
+}
+
+
+/* Decode the header at the current position into *h without consuming it:
+ * the position is remembered in tar->last_header, one raw record is read,
+ * and the archive is sought back before decoding.
+ * Returns the first error from the read, the seek, or raw_to_header(). */
+int mtar_read_header(mtar_t *tar, mtar_header_t *h) {
+  mtar_raw_header_t raw;
+  int status;
+
+  tar->last_header = tar->pos;
+  status = tread(tar, &raw, sizeof(raw));
+  if (status == MTAR_ESUCCESS) {
+    status = mtar_seek(tar, tar->last_header);
+  }
+  if (status != MTAR_ESUCCESS) {
+    return status;
+  }
+  return raw_to_header(h, &raw);
+}
+
+
+/* Read `size` bytes of the current record's payload into `ptr`.
+ * When tar->remaining_data is 0 the call is treated as the first read of a
+ * record: the header is read to learn the payload size and the archive is
+ * moved past the header block. After the read, remaining_data drops by
+ * `size`; once it reaches 0 the archive seeks back to the record's header.
+ * Returns MTAR_ESUCCESS or the first error from a header read, seek or
+ * data read. */
+int mtar_read_data(mtar_t *tar, void *ptr, size_t size) {
+  int status;
+  if (tar->remaining_data == 0) {
+    mtar_header_t hdr;
+    status = mtar_read_header(tar, &hdr);
+    if (status != MTAR_ESUCCESS) {
+      return status;
+    }
+    status = mtar_seek(tar, tar->pos + sizeof(mtar_raw_header_t));
+    if (status != MTAR_ESUCCESS) {
+      return status;
+    }
+    tar->remaining_data = hdr.size;
+  }
+  status = tread(tar, ptr, size);
+  if (status != MTAR_ESUCCESS) {
+    return status;
+  }
+  tar->remaining_data -= size;
+  if (tar->remaining_data != 0) {
+    return MTAR_ESUCCESS;
+  }
+  return mtar_seek(tar, tar->last_header);
+}
+
+
+/* Write `h` as a raw record; tar->remaining_data is primed with h->size. */
+int mtar_write_header(mtar_t *tar, const mtar_header_t *h) {
+  mtar_raw_header_t raw;
+  tar->remaining_data = h->size;  /* counted down by mtar_write_data() */
+  header_to_raw(&raw, h);
+  return twrite(tar, &raw, sizeof(raw));
+}
+
+/* Write a regular-file header (mode 0664) for `name` with a `size`-byte
+ * payload. Returns MTAR_EFAILURE, writing nothing, if `name` overflows h.name. */
+int mtar_write_file_header(mtar_t *tar, const char *name, size_t size) {
+  mtar_header_t h;
+  if (strlen(name) >= sizeof(h.name)) return MTAR_EFAILURE;  /* was an overflow */
+  memset(&h, 0, sizeof(h));
+  strcpy(h.name, name);
+  h.size = size;
+  h.type = MTAR_TREG;
+  h.mode = 0664;
+  return mtar_write_header(tar, &h);
+}
+
+/* Write a directory header (mode 0775) for `name`. Returns MTAR_EFAILURE,
+ * writing nothing, when `name` plus its NUL does not fit h.name. */
+int mtar_write_dir_header(mtar_t *tar, const char *name) {
+  mtar_header_t h;
+  if (strlen(name) >= sizeof(h.name)) return MTAR_EFAILURE;  /* was an overflow */
+  memset(&h, 0, sizeof(h));
+  strcpy(h.name, name);
+  h.type = MTAR_TDIR;
+  h.mode = 0775;
+  return mtar_write_header(tar, &h);
+}
+
+
+/* Append `size` payload bytes to the record opened by the last header.
+ * tar->remaining_data is reduced by `size`; when it reaches 0 the record
+ * is padded with NUL bytes up to round_up(tar->pos, 512). */
+int mtar_write_data(mtar_t *tar, const void *data, size_t size) {
+  const int status = twrite(tar, data, size);
+  if (status != MTAR_ESUCCESS) {
+    return status;
+  }
+  tar->remaining_data -= size;
+  if (tar->remaining_data != 0) {
+    return MTAR_ESUCCESS;  /* more payload still expected */
+  }
+  return write_null_bytes(tar, round_up(tar->pos, 512) - tar->pos);
+}
+
+
+int mtar_finalize(mtar_t *tar) {
+  /* End-of-archive marker: two raw-header-sized blocks of NUL bytes */
+  return write_null_bytes(tar, 2 * sizeof(mtar_raw_header_t));
+}
diff --git a/lib/microtar/microtar.h b/lib/microtar/microtar.h
new file mode 100755
index 0000000..b9e3ae7
--- /dev/null
+++ b/lib/microtar/microtar.h
@@ -0,0 +1,90 @@
+/**
+ * Copyright (c) 2017 rxi
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the MIT license. See `microtar.c` for details.
+ */
+
+#ifndef MICROTAR_H
+#define MICROTAR_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define MTAR_VERSION "0.1.0"
+
+enum {  /* status codes returned by the mtar_* functions; see mtar_strerror() */
+  MTAR_ESUCCESS     =  0,
+  MTAR_EFAILURE     = -1,
+  MTAR_EOPENFAIL    = -2,  /* fopen() failed in mtar_open() */
+  MTAR_EREADFAIL    = -3,  /* stdio read callback got fewer bytes than asked */
+  MTAR_EWRITEFAIL   = -4,  /* stdio write callback wrote fewer bytes than asked */
+  MTAR_ESEEKFAIL    = -5,  /* fseek() failed in the stdio seek callback */
+  MTAR_EBADCHKSUM   = -6,  /* stored and computed header checksums differ */
+  MTAR_ENULLRECORD  = -7,  /* header whose checksum field starts with NUL */
+  MTAR_ENOTFOUND    = -8   /* mtar_find() hit a null record before a match */
+};
+
+enum {  /* values for mtar_header_t.type */
+  MTAR_TREG   = '0',  /* written in place of a type left as 0 */
+  MTAR_TLNK   = '1',
+  MTAR_TSYM   = '2',
+  MTAR_TCHR   = '3',
+  MTAR_TBLK   = '4',
+  MTAR_TDIR   = '5',  /* set by mtar_write_dir_header() */
+  MTAR_TFIFO  = '6'
+};
+
+typedef struct {  /* decoded form of one archive record header */
+  unsigned mode;   /* the four numeric fields are octal text in the raw record */
+  unsigned owner;
+  unsigned size;   /* payload length in bytes */
+  unsigned mtime;
+  unsigned type;   /* one of the MTAR_T* values */
+  char name[100];  /* NUL-terminated; the library copies it with strcpy() */
+  char linkname[100];
+} mtar_header_t;
+
+
+typedef struct mtar_t mtar_t;
+
+struct mtar_t {  /* archive handle; the four callbacks form the I/O backend */
+  int (*read)(mtar_t *tar, void *data, size_t size);
+  int (*write)(mtar_t *tar, const void *data, size_t size);
+  int (*seek)(mtar_t *tar, long pos);
+  int (*close)(mtar_t *tar);
+  void *stream;           /* a FILE* when opened through mtar_open() */
+  size_t pos;             /* current offset as tracked by the library */
+  size_t remaining_data;  /* payload bytes left in the current record */
+  size_t last_header;     /* offset saved by the last mtar_read_header() */
+};
+
+
+const char* mtar_strerror(int err);
+
+int mtar_open(mtar_t *tar, const char *filename, const char *mode);
+int mtar_close(mtar_t *tar);
+
+int mtar_seek(mtar_t *tar, long pos);
+int mtar_rewind(mtar_t *tar);
+int mtar_next(mtar_t *tar);
+int mtar_find(mtar_t *tar, const char *name, mtar_header_t *h);
+int mtar_read_header(mtar_t *tar, mtar_header_t *h);
+int mtar_read_data(mtar_t *tar, void *ptr, size_t size);
+
+int mtar_write_header(mtar_t *tar, const mtar_header_t *h);
+int mtar_write_file_header(mtar_t *tar, const char *name, size_t size);
+int mtar_write_dir_header(mtar_t *tar, const char *name);
+int mtar_write_data(mtar_t *tar, const void *data, size_t size);
+int mtar_finalize(mtar_t *tar);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/lib/nedmalloc/CMakeLists.txt b/lib/nedmalloc/CMakeLists.txt
new file mode 100644
index 0000000..1e89fdb
--- /dev/null
+++ b/lib/nedmalloc/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_library(nedmalloc nedmalloc.c)
+set_target_properties(nedmalloc
+        PROPERTIES  # NOTE(review): the same -D string is also passed as LINK_FLAGS; confirm that is intended
+                COMPILE_FLAGS "-DREPLACE_SYSTEM_ALLOCATOR -DWIN32 -DNO_MALLINFO=1 ${MMSEQS_CXX_FLAGS}"
+                LINK_FLAGS "-DREPLACE_SYSTEM_ALLOCATOR -DWIN32 -DNO_MALLINFO=1 ${MMSEQS_CXX_FLAGS}")
diff --git a/lib/nedmalloc/License.txt b/lib/nedmalloc/License.txt
new file mode 100644
index 0000000..36b7cd9
--- /dev/null
+++ b/lib/nedmalloc/License.txt
@@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/lib/nedmalloc/Readme.txt b/lib/nedmalloc/Readme.txt
new file mode 100644
index 0000000..07cbf50
--- /dev/null
+++ b/lib/nedmalloc/Readme.txt
@@ -0,0 +1,136 @@
+nedalloc v1.05 15th June 2008:
+-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
+
+by Niall Douglas (http://www.nedprod.com/programs/portable/nedmalloc/)
+
+Enclosed is nedalloc, an alternative malloc implementation for multiple
+threads without lock contention based on dlmalloc v2.8.4. It is more
+or less a newer implementation of ptmalloc2, the standard allocator in
+Linux (which is based on dlmalloc v2.7.0) but also contains a per-thread
+cache for maximum CPU scalability.
+
+It is licensed under the Boost Software License which basically means
+you can do anything you like with it. This does not apply to the malloc.c.h
+file which remains copyright to others.
+
+It has been tested on win32 (x86), win64 (x64), Linux (x64), FreeBSD (x64)
+and Apple MacOS X (x86). It works very well on all of these and is very
+significantly faster than the system allocator on all of these platforms.
+
+By literally dropping in this allocator as a replacement for your system
+allocator, you can see real world improvements of up to three times in normal
+code!
+
+To use:
+-=-=-=-
+Drop in nedmalloc.h, nedmalloc.c and malloc.c.h into your project.
+Configure using the instructions in nedmalloc.h. Run and enjoy.
+
+To test, compile test.c. It will run a comparison between your system
+allocator and nedalloc and tell you how much faster nedalloc is. It also
+serves as an example of usage.
+
+Notes:
+-=-=-=
+If you want the very latest version of this allocator, get it from the
+TnFOX SVN repository at svn://svn.berlios.de/viewcvs/tnfox/trunk/src/nedmalloc
+
+Because of how nedalloc allocates an mspace per thread, it can cause
+severe bloating of memory usage under certain allocation patterns.
+You can substantially reduce this wastage by setting MAXTHREADSINPOOL
+or the threads parameter to nedcreatepool() to a fraction of the number of
+threads which would normally be in a pool at once. This will reduce
+bloating at the cost of an increase in lock contention. If allocated size
+is less than THREADCACHEMAX, locking is avoided 90-99% of the time and
+if most of your allocations are below this value, you can safely set
+MAXTHREADSINPOOL to one.
+
+You will suffer memory leakage unless you call neddisablethreadcache()
+per pool for every thread which exits. This is because nedalloc cannot
+portably know when a thread exits and thus when its thread cache can
+be returned for use by other code. Don't forget pool zero, the system pool.
+
+For C++ type allocation patterns (where the same sizes of memory are
+regularly allocated and deallocated as objects are created and destroyed),
+the threadcache always benefits performance. If however your allocation
+patterns are different, searching the threadcache may significantly slow
+down your code - as a rule of thumb, if cache utilisation is below 80%
+(see the source for neddisablethreadcache() for how to enable debug
+printing in release mode) then you should disable the thread cache for
+that thread. You can compile out the threadcache code by setting
+THREADCACHEMAX to zero.
+
+Speed comparisons:
+-=-=-=-=-=-=-=-=-=
+See Benchmarks.xls for details.
+
+The enclosed test.c can do two things: it can be a torture test or a speed
+test. The speed test is designed to be a representative synthetic
+memory allocator test. It works by randomly mixing allocations with frees
+with half of the allocation sizes being a two power multiple less than
+512 bytes (to mimic C++ stack instantiated objects) and the other half
+being a simple random value less than 16Kb.
+
+The real world code results are from Tn's TestIO benchmark. This is a
+heavily multithreaded and memory intensive benchmark with a lot of branching
+and other stuff modern processors don't like so much. As you'll note, the
+test doesn't show the benefits of the threadcache mostly due to the saturation
+of the memory bus being the limiting factor.
+
+ChangeLog:
+-=-=-=-=-=
+v1.05 15th June 2008:
+ * { 1042 } Added error check for TLSSET() and TLSFREE() macros. Thanks to
+Markus Elfring for reporting this.
+ * { 1043 } Fixed a segfault when freeing memory allocated using
+nedindependent_comalloc(). Thanks to Pavel Vozenilek for reporting this.
+
+v1.04 14th July 2007:
+ * Fixed a bug with the new optimised implementation that failed to lock
+on a realloc under certain conditions.
+ * Fixed lack of thread synchronisation in InitPool() causing pool corruption
+ * Fixed a memory leak of thread cache contents on disabling. Thanks to Earl
+Chew for reporting this.
+ * Added a sanity check for freed blocks being valid.
+ * Reworked test.c into being a torture test.
+ * Fixed GCC assembler optimisation misspecification
+
+v1.04alpha_svn915 7th October 2006:
+ * Fixed failure to unlock thread cache list if allocating a new list failed.
+Thanks to Dmitry Chichkov for reporting this. Further thanks to Aleksey Sanin.
+ * Fixed realloc(0, <size>) segfaulting. Thanks to Dmitry Chichkov for
+reporting this.
+ * Made config defines #ifndef so they can be overridden by the build system.
+Thanks to Aleksey Sanin for suggesting this.
+ * Fixed deadlock in nedprealloc() due to unnecessary locking of preferred
+thread mspace when mspace_realloc() always uses the original block's mspace
+anyway. Thanks to Aleksey Sanin for reporting this.
+ * Made some speed improvements by hacking mspace_malloc() to no longer lock
+its mspace, thus allowing the recursive mutex implementation to be removed
+with an associated speed increase. Thanks to Aleksey Sanin for suggesting this.
+ * Fixed a bug where allocating mspaces overran its max limit. Thanks to
+Aleksey Sanin for reporting this.
+
+v1.03 10th July 2006:
+ * Fixed memory corruption bug in threadcache code which only appeared with >4
+threads and in heavy use of the threadcache.
+
+v1.02 15th May 2006:
+ * Integrated dlmalloc v2.8.4, fixing the win32 memory release problem and
+improving performance still further. Speed is now up to twice the speed of v1.01
+(average is 67% faster).
+ * Fixed win32 critical section implementation. Thanks to Pavel Kuznetsov
+for reporting this.
+ * Wasn't locking mspace if all mspaces were locked. Thanks to Pavel Kuznetsov
+for reporting this.
+ * Added Apple Mac OS X support.
+
+v1.01 24th February 2006:
+ * Fixed multiprocessor scaling problems by removing sources of cache sloshing
+ * Earl Chew <earl_chew <at> agilent <dot> com> sent patches for the following:
+   1. size2binidx() wasn't working for default code path (non x86)
+   2. Fixed failure to release mspace lock under certain circumstances which
+      caused a deadlock
+
+v1.00 1st January 2006:
+ * First release
diff --git a/lib/nedmalloc/malloc.c.h b/lib/nedmalloc/malloc.c.h
new file mode 100644
index 0000000..814845d
--- /dev/null
+++ b/lib/nedmalloc/malloc.c.h
@@ -0,0 +1,5761 @@
+/*
+  This is a version (aka dlmalloc) of malloc/free/realloc written by
+  Doug Lea and released to the public domain, as explained at
+  http://creativecommons.org/licenses/publicdomain.  Send questions,
+  comments, complaints, performance data, etc to dl@cs.oswego.edu
+
+* Version pre-2.8.4 Mon Nov 27 11:22:37 2006    (dl at gee)
+
+   Note: There may be an updated version of this malloc obtainable at
+	   ftp://gee.cs.oswego.edu/pub/misc/malloc.c
+	 Check before installing!
+
+* Quickstart
+
+  This library is all in one file to simplify the most common usage:
+  ftp it, compile it (-O3), and link it into another program. All of
+  the compile-time options default to reasonable values for use on
+  most platforms.  You might later want to step through various
+  compile-time and dynamic tuning options.
+
+  For convenience, an include file for code using this malloc is at:
+     ftp://gee.cs.oswego.edu/pub/misc/malloc-2.8.4.h
+  You don't really need this .h file unless you call functions not
+  defined in your system include files.  The .h file contains only the
+  excerpts from this file needed for using this malloc on ANSI C/C++
+  systems, so long as you haven't changed compile-time options about
+  naming and tuning parameters.  If you do, then you can create your
+  own malloc.h that does include all settings by cutting at the point
+  indicated below. Note that you may already by default be using a C
+  library containing a malloc that is based on some version of this
+  malloc (for example in linux). You might still want to use the one
+  in this file to customize settings or to avoid overheads associated
+  with library versions.
+
+* Vital statistics:
+
+  Supported pointer/size_t representation:       4 or 8 bytes
+       size_t MUST be an unsigned type of the same width as
+       pointers. (If you are using an ancient system that declares
+       size_t as a signed type, or need it to be a different width
+       than pointers, you can use a previous release of this malloc
+       (e.g. 2.7.2) supporting these.)
+
+  Alignment:                                     8 bytes (default)
+       This suffices for nearly all current machines and C compilers.
+       However, you can define MALLOC_ALIGNMENT to be wider than this
+       if necessary (up to 128bytes), at the expense of using more space.
+
+  Minimum overhead per allocated chunk:   4 or  8 bytes (if 4byte sizes)
+					  8 or 16 bytes (if 8byte sizes)
+       Each malloced chunk has a hidden word of overhead holding size
+       and status information, and additional cross-check word
+       if FOOTERS is defined.
+
+  Minimum allocated size: 4-byte ptrs:  16 bytes    (including overhead)
+			  8-byte ptrs:  32 bytes    (including overhead)
+
+       Even a request for zero bytes (i.e., malloc(0)) returns a
+       pointer to something of the minimum allocatable size.
+       The maximum overhead wastage (i.e., number of extra bytes
+       allocated than were requested in malloc) is less than or equal
+       to the minimum size, except for requests >= mmap_threshold that
+       are serviced via mmap(), where the worst case wastage is about
+       32 bytes plus the remainder from a system page (the minimal
+       mmap unit); typically 4096 or 8192 bytes.
+
+  Security: static-safe; optionally more or less
+       The "security" of malloc refers to the ability of malicious
+       code to accentuate the effects of errors (for example, freeing
+       space that is not currently malloc'ed or overwriting past the
+       ends of chunks) in code that calls malloc.  This malloc
+       guarantees not to modify any memory locations below the base of
+       heap, i.e., static variables, even in the presence of usage
+       errors.  The routines additionally detect most improper frees
+       and reallocs.  All this holds as long as the static bookkeeping
+       for malloc itself is not corrupted by some other means.  This
+       is only one aspect of security -- these checks do not, and
+       cannot, detect all possible programming errors.
+
+       If FOOTERS is defined nonzero, then each allocated chunk
+       carries an additional check word to verify that it was malloced
+       from its space.  These check words are the same within each
+       execution of a program using malloc, but differ across
+       executions, so externally crafted fake chunks cannot be
+       freed. This improves security by rejecting frees/reallocs that
+       could corrupt heap memory, in addition to the checks preventing
+       writes to statics that are always on.  This may further improve
+       security at the expense of time and space overhead.  (Note that
+       FOOTERS may also be worth using with MSPACES.)
+
+       By default detected errors cause the program to abort (calling
+       "abort()"). You can override this to instead proceed past
+       errors by defining PROCEED_ON_ERROR.  In this case, a bad free
+       has no effect, and a malloc that encounters a bad address
+       caused by user overwrites will ignore the bad address by
+       dropping pointers and indices to all known memory. This may
+       be appropriate for programs that should continue if at all
+       possible in the face of programming errors, although they may
+       run out of memory because dropped memory is never reclaimed.
+
+       If you don't like either of these options, you can define
+       CORRUPTION_ERROR_ACTION and USAGE_ERROR_ACTION to do anything
+       else. And if you are sure that your program using malloc has
+       no errors or vulnerabilities, you can define INSECURE to 1,
+       which might (or might not) provide a small performance improvement.
+
+  Thread-safety: NOT thread-safe unless USE_LOCKS defined
+       When USE_LOCKS is defined, each public call to malloc, free,
+       etc is surrounded with either a pthread mutex or a win32
+       spinlock (depending on WIN32). This is not especially fast, and
+       can be a major bottleneck.  It is designed only to provide
+       minimal protection in concurrent environments, and to provide a
+       basis for extensions.  If you are using malloc in a concurrent
+       program, consider instead using nedmalloc
+       (http://www.nedprod.com/programs/portable/nedmalloc/) or
+       ptmalloc (See http://www.malloc.de), which are derived
+       from versions of this malloc.
+
+  System requirements: Any combination of MORECORE and/or MMAP/MUNMAP
+       This malloc can use unix sbrk or any emulation (invoked using
+       the CALL_MORECORE macro) and/or mmap/munmap or any emulation
+       (invoked using CALL_MMAP/CALL_MUNMAP) to get and release system
+       memory.  On most unix systems, it tends to work best if both
+       MORECORE and MMAP are enabled.  On Win32, it uses emulations
+       based on VirtualAlloc. It also uses common C library functions
+       like memset.
+
+  Compliance: I believe it is compliant with the Single Unix Specification
+       (See http://www.unix.org). Also SVID/XPG, ANSI C, and probably
+       others as well.
+
+* Overview of algorithms
+
+  This is not the fastest, most space-conserving, most portable, or
+  most tunable malloc ever written. However it is among the fastest
+  while also being among the most space-conserving, portable and
+  tunable.  Consistent balance across these factors results in a good
+  general-purpose allocator for malloc-intensive programs.
+
+  In most ways, this malloc is a best-fit allocator. Generally, it
+  chooses the best-fitting existing chunk for a request, with ties
+  broken in approximately least-recently-used order. (This strategy
+  normally maintains low fragmentation.) However, for requests less
+  than 256bytes, it deviates from best-fit when there is not an
+  exactly fitting available chunk by preferring to use space adjacent
+  to that used for the previous small request, as well as by breaking
+  ties in approximately most-recently-used order. (These enhance
+  locality of series of small allocations.)  And for very large requests
+  (>= 256Kb by default), it relies on system memory mapping
+  facilities, if supported.  (This helps avoid carrying around and
+  possibly fragmenting memory used only for large chunks.)
+
+  All operations (except malloc_stats and mallinfo) have execution
+  times that are bounded by a constant factor of the number of bits in
+  a size_t, not counting any clearing in calloc or copying in realloc,
+  or actions surrounding MORECORE and MMAP that have times
+  proportional to the number of non-contiguous regions returned by
+  system allocation routines, which is often just 1. In real-time
+  applications, you can optionally suppress segment traversals using
+  NO_SEGMENT_TRAVERSAL, which assures bounded execution even when
+  system allocators return non-contiguous spaces, at the typical
+  expense of carrying around more memory and increased fragmentation.
+
+  The implementation is not very modular and seriously overuses
+  macros. Perhaps someday all C compilers will do as good a job
+  inlining modular code as can now be done by brute-force expansion,
+  but now, enough of them seem not to.
+
+  Some compilers issue a lot of warnings about code that is
+  dead/unreachable only on some platforms, and also about intentional
+  uses of negation on unsigned types. All known cases of each can be
+  ignored.
+
+  For a longer but out of date high-level description, see
+     http://gee.cs.oswego.edu/dl/html/malloc.html
+
+* MSPACES
+  If MSPACES is defined, then in addition to malloc, free, etc.,
+  this file also defines mspace_malloc, mspace_free, etc. These
+  are versions of malloc routines that take an "mspace" argument
+  obtained using create_mspace, to control all internal bookkeeping.
+  If ONLY_MSPACES is defined, only these versions are compiled.
+  So if you would like to use this allocator for only some allocations,
+  and your system malloc for others, you can compile with
+  ONLY_MSPACES and then do something like...
+    static mspace mymspace = create_mspace(0,0); // for example
+    #define mymalloc(bytes)  mspace_malloc(mymspace, bytes)
+
+  (Note: If you only need one instance of an mspace, you can instead
+  use "USE_DL_PREFIX" to relabel the global malloc.)
+
+  You can similarly create thread-local allocators by storing
+  mspaces as thread-locals. For example:
+    static __thread mspace tlms = 0;
+    void*  tlmalloc(size_t bytes) {
+      if (tlms == 0) tlms = create_mspace(0, 0);
+      return mspace_malloc(tlms, bytes);
+    }
+    void  tlfree(void* mem) { mspace_free(tlms, mem); }
+
+  Unless FOOTERS is defined, each mspace is completely independent.
+  You cannot allocate from one and free to another (although
+  conformance is only weakly checked, so usage errors are not always
+  caught). If FOOTERS is defined, then each chunk carries around a tag
+  indicating its originating mspace, and frees are directed to their
+  originating spaces.
+
+ -------------------------  Compile-time options ---------------------------
+
+Be careful in setting #define values for numerical constants of type
+size_t. On some systems, literal values are not automatically extended
+to size_t precision unless they are explicitly cast. You can also
+use the symbolic values MAX_SIZE_T, SIZE_T_ONE, etc below.
+
+WIN32                    default: defined if _WIN32 defined
+  Defining WIN32 sets up defaults for MS environment and compilers.
+  Otherwise defaults are for unix. Beware that there seem to be some
+  cases where this malloc might not be a pure drop-in replacement for
+  Win32 malloc: Random-looking failures from Win32 GDI API's (eg;
+  SetDIBits()) may be due to bugs in some video driver implementations
+  when pixel buffers are malloc()ed, and the region spans more than
+  one VirtualAlloc()ed region. Because dlmalloc uses a small (64Kb)
+  default granularity, pixel buffers may straddle virtual allocation
+  regions more often than when using the Microsoft allocator.  You can
+  avoid this by using VirtualAlloc() and VirtualFree() for all pixel
+  buffers rather than using malloc().  If this is not possible,
+  recompile this malloc with a larger DEFAULT_GRANULARITY.
+
+MALLOC_ALIGNMENT         default: (size_t)8
+  Controls the minimum alignment for malloc'ed chunks.  It must be a
+  power of two and at least 8, even on machines for which smaller
+  alignments would suffice. It may be defined as larger than this
+  though. Note however that code and data structures are optimized for
+  the case of 8-byte alignment.
+
+MSPACES                  default: 0 (false)
+  If true, compile in support for independent allocation spaces.
+  This is only supported if HAVE_MMAP is true.
+
+ONLY_MSPACES             default: 0 (false)
+  If true, only compile in mspace versions, not regular versions.
+
+USE_LOCKS                default: 0 (false)
+  Causes each call to each public routine to be surrounded with
+  pthread or WIN32 mutex lock/unlock. (If set true, this can be
+  overridden on a per-mspace basis for mspace versions.) If set to a
+  non-zero value other than 1, locks are used, but their
+  implementation is left out, so lock functions must be supplied manually.
+
+USE_SPIN_LOCKS           default: 1 iff USE_LOCKS and on x86 using gcc or MSC
+  If true, uses custom spin locks for locking. This is currently
+  supported only for x86 platforms using gcc or recent MS compilers.
+  Otherwise, posix locks or win32 critical sections are used.
+
+FOOTERS                  default: 0
+  If true, provide extra checking and dispatching by placing
+  information in the footers of allocated chunks. This adds
+  space and time overhead.
+
+INSECURE                 default: 0
+  If true, omit checks for usage errors and heap space overwrites.
+
+USE_DL_PREFIX            default: NOT defined
+  Causes compiler to prefix all public routines with the string 'dl'.
+  This can be useful when you only want to use this malloc in one part
+  of a program, using your regular system malloc elsewhere.
+
+ABORT                    default: defined as abort()
+  Defines how to abort on failed checks.  On most systems, a failed
+  check cannot die with an "assert" or even print an informative
+  message, because the underlying print routines in turn call malloc,
+  which will fail again.  Generally, the best policy is to simply call
+  abort(). It's not very useful to do more than this because many
+  errors due to overwriting will show up as address faults (null, odd
+  addresses etc) rather than malloc-triggered checks, so will also
+  abort.  Also, most compilers know that abort() does not return, so
+  can better optimize code conditionally calling it.
+
+PROCEED_ON_ERROR           default: defined as 0 (false)
+  Controls whether detected bad addresses cause them to be bypassed
+  rather than aborting. If set, detected bad arguments to free and
+  realloc are ignored. And all bookkeeping information is zeroed out
+  upon a detected overwrite of freed heap space, thus losing the
+  ability to ever return it from malloc again, but enabling the
+  application to proceed. If PROCEED_ON_ERROR is defined, the
+  static variable malloc_corruption_error_count is compiled in
+  and can be examined to see if errors have occurred. This option
+  generates slower code than the default abort policy.
+
+DEBUG                    default: NOT defined
+  The DEBUG setting is mainly intended for people trying to modify
+  this code or diagnose problems when porting to new platforms.
+  However, it may also be able to better isolate user errors than just
+  using runtime checks.  The assertions in the check routines spell
+  out in more detail the assumptions and invariants underlying the
+  algorithms.  The checking is fairly extensive, and will slow down
+  execution noticeably. Calling malloc_stats or mallinfo with DEBUG
+  set will attempt to check every non-mmapped allocated and free chunk
+  in the course of computing the summaries.
+
+ABORT_ON_ASSERT_FAILURE   default: defined as 1 (true)
+  Debugging assertion failures can be nearly impossible if your
+  version of the assert macro causes malloc to be called, which will
+  lead to a cascade of further failures, blowing the runtime stack.
+  ABORT_ON_ASSERT_FAILURE causes assertion failures to call abort(),
+  which will usually make debugging easier.
+
+MALLOC_FAILURE_ACTION     default: sets errno to ENOMEM, or no-op on win32
+  The action to take before "return 0" when malloc fails to be able to
+  return memory because there is none available.
+
+HAVE_MORECORE             default: 1 (true) unless win32 or ONLY_MSPACES
+  True if this system supports sbrk or an emulation of it.
+
+MORECORE                  default: sbrk
+  The name of the sbrk-style system routine to call to obtain more
+  memory.  See below for guidance on writing custom MORECORE
+  functions. The type of the argument to sbrk/MORECORE varies across
+  systems.  It cannot be size_t, because it supports negative
+  arguments, so it is normally the signed type of the same width as
+  size_t (sometimes declared as "intptr_t").  It doesn't much matter
+  though. Internally, we only call it with arguments less than half
+  the max value of a size_t, which should work across all reasonable
+  possibilities, although sometimes generating compiler warnings.
+
+MORECORE_CONTIGUOUS       default: 1 (true) if HAVE_MORECORE
+  If true, take advantage of fact that consecutive calls to MORECORE
+  with positive arguments always return contiguous increasing
+  addresses.  This is true of unix sbrk. It does not hurt too much to
+  set it true anyway, since malloc copes with non-contiguities.
+  Setting it false when definitely non-contiguous saves time
+  and possibly wasted space it would take to discover this though.
+
+MORECORE_CANNOT_TRIM      default: NOT defined
+  True if MORECORE cannot release space back to the system when given
+  negative arguments. This is generally necessary only if you are
+  using a hand-crafted MORECORE function that cannot handle negative
+  arguments.
+
+NO_SEGMENT_TRAVERSAL       default: 0
+  If non-zero, suppresses traversals of memory segments
+  returned by either MORECORE or CALL_MMAP. This disables
+  merging of segments that are contiguous, and selectively
+  releasing them to the OS if unused, but bounds execution times.
+
+HAVE_MMAP                 default: 1 (true)
+  True if this system supports mmap or an emulation of it.  If so, and
+  HAVE_MORECORE is not true, MMAP is used for all system
+  allocation. If set and HAVE_MORECORE is true as well, MMAP is
+  primarily used to directly allocate very large blocks. It is also
+  used as a backup strategy in cases where MORECORE fails to provide
+  space from system. Note: A single call to MUNMAP is assumed to be
+  able to unmap memory that may have been allocated using multiple calls
+  to MMAP, so long as they are adjacent.
+
+HAVE_MREMAP               default: 1 on linux, else 0
+  If true realloc() uses mremap() to re-allocate large blocks and
+  extend or shrink allocation spaces.
+
+MMAP_CLEARS               default: 1 except on WINCE.
+  True if mmap clears memory so calloc doesn't need to. This is true
+  for standard unix mmap using /dev/zero and on WIN32 except for WINCE.
+
+USE_BUILTIN_FFS            default: 0 (i.e., not used)
+  Causes malloc to use the builtin ffs() function to compute indices.
+  Some compilers may recognize and intrinsify ffs to be faster than the
+  supplied C version. Also, the case of x86 using gcc is special-cased
+  to an asm instruction, so is already as fast as it can be, and so
+  this setting has no effect. Similarly for Win32 under recent MS compilers.
+  (On most x86s, the asm version is only slightly faster than the C version.)
+
+malloc_getpagesize         default: derive from system includes, or 4096.
+  The system page size. To the extent possible, this malloc manages
+  memory from the system in page-size units.  This may be (and
+  usually is) a function rather than a constant. This is ignored
+  if WIN32, where page size is determined using GetSystemInfo during
+  initialization.
+
+USE_DEV_RANDOM             default: 0 (i.e., not used)
+  Causes malloc to use /dev/random to initialize secure magic seed for
+  stamping footers. Otherwise, the current time is used.
+
+NO_MALLINFO                default: 0
+  If defined, don't compile "mallinfo". This can be a simple way
+  of dealing with mismatches between system declarations and
+  those in this file.
+
+MALLINFO_FIELD_TYPE        default: size_t
+  The type of the fields in the mallinfo struct. This was originally
+  defined as "int" in SVID etc, but is more usefully defined as
+  size_t. The value is used only if HAVE_USR_INCLUDE_MALLOC_H is not set.
+
+REALLOC_ZERO_BYTES_FREES    default: not defined
+  This should be set if a call to realloc with zero bytes should
+  be the same as a call to free. Some people think it should. Otherwise,
+  since this malloc returns a unique pointer for malloc(0), so does
+  realloc(p, 0).
+
+LACKS_UNISTD_H, LACKS_FCNTL_H, LACKS_SYS_PARAM_H, LACKS_SYS_MMAN_H
+LACKS_STRINGS_H, LACKS_STRING_H, LACKS_SYS_TYPES_H,  LACKS_ERRNO_H
+LACKS_STDLIB_H                default: NOT defined unless on WIN32
+  Define these if your system does not have these header files.
+  You might need to manually insert some of the declarations they provide.
+
+DEFAULT_GRANULARITY        default: page size if MORECORE_CONTIGUOUS,
+				system_info.dwAllocationGranularity in WIN32,
+				otherwise 64K.
+      Also settable using mallopt(M_GRANULARITY, x)
+  The unit for allocating and deallocating memory from the system.  On
+  most systems with contiguous MORECORE, there is no reason to
+  make this more than a page. However, systems with MMAP tend to
+  either require or encourage larger granularities.  You can increase
+  this value to prevent system allocation functions from being called so
+  often, especially if they are slow.  The value must be at least one
+  page and must be a power of two.  Setting to 0 causes initialization
+  to either page size or win32 region size.  (Note: In previous
+  versions of malloc, the equivalent of this option was called
+  "TOP_PAD")
+
+DEFAULT_TRIM_THRESHOLD    default: 2MB
+      Also settable using mallopt(M_TRIM_THRESHOLD, x)
+  The maximum amount of unused top-most memory to keep before
+  releasing via malloc_trim in free().  Automatic trimming is mainly
+  useful in long-lived programs using contiguous MORECORE.  Because
+  trimming via sbrk can be slow on some systems, and can sometimes be
+  wasteful (in cases where programs immediately afterward allocate
+  more large chunks) the value should be high enough so that your
+  overall system performance would improve by releasing this much
+  memory.  As a rough guide, you might set to a value close to the
+  average size of a process (program) running on your system.
+  Releasing this much memory would allow such a process to run in
+  memory.  Generally, it is worth tuning trim thresholds when a
+  program undergoes phases where several large chunks are allocated
+  and released in ways that can reuse each other's storage, perhaps
+  mixed with phases where there are no such chunks at all. The trim
+  value must be greater than page size to have any useful effect.  To
+  disable trimming completely, you can set to MAX_SIZE_T. Note that the trick
+  some people use of mallocing a huge space and then freeing it at
+  program startup, in an attempt to reserve system memory, doesn't
+  have the intended effect under automatic trimming, since that memory
+  will immediately be returned to the system.
+
+DEFAULT_MMAP_THRESHOLD       default: 256K
+      Also settable using mallopt(M_MMAP_THRESHOLD, x)
+  The request size threshold for using MMAP to directly service a
+  request. Requests of at least this size that cannot be allocated
+  using already-existing space will be serviced via mmap.  (If enough
+  normal freed space already exists it is used instead.)  Using mmap
+  segregates relatively large chunks of memory so that they can be
+  individually obtained and released from the host system. A request
+  serviced through mmap is never reused by any other request (at least
+  not directly; the system may just so happen to remap successive
+  requests to the same locations).  Segregating space in this way has
+  the benefits that: Mmapped space can always be individually released
+  back to the system, which helps keep the system level memory demands
+  of a long-lived program low.  Also, mapped memory doesn't become
+  `locked' between other chunks, as can happen with normally allocated
+  chunks, which means that even trimming via malloc_trim would not
+  release them.  However, it has the disadvantage that the space
+  cannot be reclaimed, consolidated, and then used to service later
+  requests, as happens with normal chunks.  The advantages of mmap
+  nearly always outweigh disadvantages for "large" chunks, but the
+  value of "large" may vary across systems.  The default is an
+  empirically derived value that works well in most systems. You can
+  disable mmap by setting to MAX_SIZE_T.
+
+MAX_RELEASE_CHECK_RATE   default: 4095 unless not HAVE_MMAP
+  The number of consolidated frees between checks to release
+  unused segments when freeing. When using non-contiguous segments,
+  especially with multiple mspaces, checking only for topmost space
+  doesn't always suffice to trigger trimming. To compensate for this,
+  free() will, with a period of MAX_RELEASE_CHECK_RATE (or the
+  current number of segments, if greater) try to release unused
+  segments to the OS when freeing chunks that result in
+  consolidation. The best value for this parameter is a compromise
+  between slowing down frees with relatively costly checks that
+  rarely trigger versus holding on to unused memory. To effectively
+  disable, set to MAX_SIZE_T. This may lead to a very slight speed
+  improvement at the expense of carrying around more memory.
+*/
+
+/* Version identifier to allow people to support multiple versions */
+#ifndef DLMALLOC_VERSION
+#define DLMALLOC_VERSION 20804
+#endif /* DLMALLOC_VERSION */
+
+#if defined(linux)
+#define _GNU_SOURCE 1
+#endif
+
+#ifndef WIN32
+#ifdef _WIN32
+#define WIN32 1
+#endif  /* _WIN32 */
+#ifdef _WIN32_WCE
+#define LACKS_FCNTL_H
+#define WIN32 1
+#endif /* _WIN32_WCE */
+#endif  /* WIN32 */
+#ifdef WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef _WIN32_WINNT
+#define _WIN32_WINNT 0x403
+#endif
+#include <windows.h>
+#define HAVE_MMAP 1
+#define HAVE_MORECORE 0
+#define LACKS_UNISTD_H
+#define LACKS_SYS_PARAM_H
+#define LACKS_SYS_MMAN_H
+#define LACKS_STRING_H
+#define LACKS_STRINGS_H
+#define LACKS_SYS_TYPES_H
+#define LACKS_ERRNO_H
+#ifndef MALLOC_FAILURE_ACTION
+#define MALLOC_FAILURE_ACTION
+#endif /* MALLOC_FAILURE_ACTION */
+#ifdef _WIN32_WCE /* WINCE reportedly does not clear */
+#define MMAP_CLEARS 0
+#else
+#define MMAP_CLEARS 1
+#endif /* _WIN32_WCE */
+#endif  /* WIN32 */
+
+#if defined(DARWIN) || defined(_DARWIN)
+/* Mac OSX docs advise not to use sbrk; it seems better to use mmap */
+#ifndef HAVE_MORECORE
+#define HAVE_MORECORE 0
+#define HAVE_MMAP 1
+/* OSX allocators provide 16 byte alignment */
+#ifndef MALLOC_ALIGNMENT
+#define MALLOC_ALIGNMENT ((size_t)16U)
+#endif
+#endif  /* HAVE_MORECORE */
+#endif  /* DARWIN */
+
+#ifndef LACKS_SYS_TYPES_H
+#include <sys/types.h>  /* For size_t */
+#endif  /* LACKS_SYS_TYPES_H */
+
+/* The maximum possible size_t value has all bits set */
+#define MAX_SIZE_T           (~(size_t)0)
+
+#ifndef ONLY_MSPACES
+#define ONLY_MSPACES 0     /* define to a value */
+#else
+#define ONLY_MSPACES 1
+#endif  /* ONLY_MSPACES */
+#ifndef MSPACES
+#if ONLY_MSPACES
+#define MSPACES 1
+#else   /* ONLY_MSPACES */
+#define MSPACES 0
+#endif  /* ONLY_MSPACES */
+#endif  /* MSPACES */
+#ifndef MALLOC_ALIGNMENT
+#define MALLOC_ALIGNMENT ((size_t)8U)
+#endif  /* MALLOC_ALIGNMENT */
+#ifndef FOOTERS
+#define FOOTERS 0
+#endif  /* FOOTERS */
+#ifndef ABORT
+#define ABORT  abort()
+#endif  /* ABORT */
+#ifndef ABORT_ON_ASSERT_FAILURE
+#define ABORT_ON_ASSERT_FAILURE 1
+#endif  /* ABORT_ON_ASSERT_FAILURE */
+#ifndef PROCEED_ON_ERROR
+#define PROCEED_ON_ERROR 0
+#endif  /* PROCEED_ON_ERROR */
+#ifndef USE_LOCKS
+#define USE_LOCKS 0
+#endif  /* USE_LOCKS */
+#ifndef USE_SPIN_LOCKS  /* default: 1 iff USE_LOCKS and (gcc on x86/x86_64 or _MSC_VER >= 1310) */
+#if USE_LOCKS && ((defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(_MSC_VER) && _MSC_VER>=1310))
+#define USE_SPIN_LOCKS 1  /* locking requested and the toolchain is a supported one */
+#else  /* compiler tests are grouped so USE_LOCKS gates both; && binds tighter than || */
+#define USE_SPIN_LOCKS 0  /* locking disabled, or unsupported compiler/architecture */
+#endif /* USE_LOCKS && (gcc-x86 || MSC) */
+#endif /* USE_SPIN_LOCKS */
+#ifndef INSECURE
+#define INSECURE 0
+#endif  /* INSECURE */
+#ifndef HAVE_MMAP
+#define HAVE_MMAP 1
+#endif  /* HAVE_MMAP */
+#ifndef MMAP_CLEARS
+#define MMAP_CLEARS 1
+#endif  /* MMAP_CLEARS */
+#ifndef HAVE_MREMAP
+#ifdef linux
+#define HAVE_MREMAP 1
+#else   /* linux */
+#define HAVE_MREMAP 0
+#endif  /* linux */
+#endif  /* HAVE_MREMAP */
+#ifndef MALLOC_FAILURE_ACTION
+#define MALLOC_FAILURE_ACTION  errno = ENOMEM;
+#endif  /* MALLOC_FAILURE_ACTION */
+#ifndef HAVE_MORECORE
+#if ONLY_MSPACES
+#define HAVE_MORECORE 0
+#else   /* ONLY_MSPACES */
+#define HAVE_MORECORE 1
+#endif  /* ONLY_MSPACES */
+#endif  /* HAVE_MORECORE */
+#if !HAVE_MORECORE
+#define MORECORE_CONTIGUOUS 0
+#else   /* !HAVE_MORECORE */
+#define MORECORE_DEFAULT sbrk
+#ifndef MORECORE_CONTIGUOUS
+#define MORECORE_CONTIGUOUS 1
+#endif  /* MORECORE_CONTIGUOUS */
+#endif  /* HAVE_MORECORE */
+#ifndef DEFAULT_GRANULARITY
+#if (MORECORE_CONTIGUOUS || defined(WIN32))
+#define DEFAULT_GRANULARITY (0)  /* 0 means to compute in init_mparams */
+#else   /* MORECORE_CONTIGUOUS */
+#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U)
+#endif  /* MORECORE_CONTIGUOUS */
+#endif  /* DEFAULT_GRANULARITY */
+#ifndef DEFAULT_TRIM_THRESHOLD
+#ifndef MORECORE_CANNOT_TRIM
+#define DEFAULT_TRIM_THRESHOLD ((size_t)2U * (size_t)1024U * (size_t)1024U)
+#else   /* MORECORE_CANNOT_TRIM */
+#define DEFAULT_TRIM_THRESHOLD MAX_SIZE_T
+#endif  /* MORECORE_CANNOT_TRIM */
+#endif  /* DEFAULT_TRIM_THRESHOLD */
+#ifndef DEFAULT_MMAP_THRESHOLD
+#if HAVE_MMAP
+#define DEFAULT_MMAP_THRESHOLD ((size_t)256U * (size_t)1024U)
+#else   /* HAVE_MMAP */
+#define DEFAULT_MMAP_THRESHOLD MAX_SIZE_T
+#endif  /* HAVE_MMAP */
+#endif  /* DEFAULT_MMAP_THRESHOLD */
+#ifndef MAX_RELEASE_CHECK_RATE
+#if HAVE_MMAP
+#define MAX_RELEASE_CHECK_RATE 4095
+#else
+#define MAX_RELEASE_CHECK_RATE MAX_SIZE_T
+#endif /* HAVE_MMAP */
+#endif /* MAX_RELEASE_CHECK_RATE */
+#ifndef USE_BUILTIN_FFS
+#define USE_BUILTIN_FFS 0
+#endif  /* USE_BUILTIN_FFS */
+#ifndef USE_DEV_RANDOM
+#define USE_DEV_RANDOM 0
+#endif  /* USE_DEV_RANDOM */
+#ifndef NO_MALLINFO
+#define NO_MALLINFO 0
+#endif  /* NO_MALLINFO */
+#ifndef MALLINFO_FIELD_TYPE
+#define MALLINFO_FIELD_TYPE size_t
+#endif  /* MALLINFO_FIELD_TYPE */
+#ifndef NO_SEGMENT_TRAVERSAL
+#define NO_SEGMENT_TRAVERSAL 0
+#endif /* NO_SEGMENT_TRAVERSAL */
+
+/*
+  mallopt tuning options.  SVID/XPG defines four standard parameter
+  numbers for mallopt, normally defined in malloc.h.  None of these
+  are used in this malloc, so setting them has no effect. But this
+  malloc does support the following options.
+*/
+
+#define M_TRIM_THRESHOLD     (-1)
+#define M_GRANULARITY        (-2)
+#define M_MMAP_THRESHOLD     (-3)
+
+/* ------------------------ Mallinfo declarations ------------------------ */
+
+#if !NO_MALLINFO
+/*
+  This version of malloc supports the standard SVID/XPG mallinfo
+  routine that returns a struct containing usage properties and
+  statistics. It should work on any system that has a
+  /usr/include/malloc.h defining struct mallinfo.  The main
+  declaration needed is the mallinfo struct that is returned (by-copy)
+  by mallinfo().  The mallinfo struct contains a bunch of fields that
+  are not even meaningful in this version of malloc.  These fields
+  are instead filled by mallinfo() with other numbers that might be of
+  interest.
+
+  HAVE_USR_INCLUDE_MALLOC_H should be set if you have a
+  /usr/include/malloc.h file that includes a declaration of struct
+  mallinfo.  If so, it is included; else a compliant version is
+  declared below.  These must be precisely the same for mallinfo() to
+  work.  The original SVID version of this struct, defined on most
+  systems with mallinfo, declares all fields as ints. But some others
+  define as unsigned long. If your system defines the fields using a
+  type of different width than listed here, you MUST #include your
+  system version and #define HAVE_USR_INCLUDE_MALLOC_H.
+*/
+
+/* #define HAVE_USR_INCLUDE_MALLOC_H */
+
+#ifdef HAVE_USR_INCLUDE_MALLOC_H
+#include "/usr/include/malloc.h"
+#else /* HAVE_USR_INCLUDE_MALLOC_H */
+#ifndef STRUCT_MALLINFO_DECLARED
+#define STRUCT_MALLINFO_DECLARED 1
+struct mallinfo {  /* usage statistics returned by value from mallinfo(); every field has type MALLINFO_FIELD_TYPE */
+  MALLINFO_FIELD_TYPE arena;    /* current total non-mmapped space allocated from system */
+  MALLINFO_FIELD_TYPE ordblks;  /* number of free chunks */
+  MALLINFO_FIELD_TYPE smblks;   /* always 0 */
+  MALLINFO_FIELD_TYPE hblks;    /* always 0 (NOTE(review): the mallinfo() description below says "current number of mmapped regions" -- confirm) */
+  MALLINFO_FIELD_TYPE hblkhd;   /* total space held in mmapped regions */
+  MALLINFO_FIELD_TYPE usmblks;  /* maximum total allocated space; exceeds the current total if trimming has occurred */
+  MALLINFO_FIELD_TYPE fsmblks;  /* always 0 */
+  MALLINFO_FIELD_TYPE uordblks; /* current total allocated space (normal or mmapped) */
+  MALLINFO_FIELD_TYPE fordblks; /* total free space */
+  MALLINFO_FIELD_TYPE keepcost; /* space that could ideally be released back to the system via malloc_trim */
+};  /* field order and width must match the system's struct mallinfo where one exists */
+#endif /* STRUCT_MALLINFO_DECLARED */
+#endif /* HAVE_USR_INCLUDE_MALLOC_H */
+#endif /* NO_MALLINFO */
+
+/*
+  Try to persuade compilers to inline. The most critical functions for
+  inlining are defined as macros, so these aren't used for them.
+*/
+
+#ifdef __MINGW64_VERSION_MAJOR
+#undef FORCEINLINE  /* drop any earlier definition so the selection below applies (presumably MinGW-w64 headers predefine it -- confirm) */
+#endif /* __MINGW64_VERSION_MAJOR */
+#ifndef FORCEINLINE  /* pick a compiler-specific "always inline" spelling unless the builder supplied one */
+  #if defined(__GNUC__)
+#define FORCEINLINE __inline __attribute__ ((always_inline))
+  #elif defined(_MSC_VER)
+    #define FORCEINLINE __forceinline
+  #endif /* other compilers: left undefined here; fallback definitions follow further down */
+#endif /* FORCEINLINE */
+#ifndef NOINLINE  /* compiler-specific "never inline" attribute; expands to nothing on unknown compilers */
+  #if defined(__GNUC__)
+    #define NOINLINE __attribute__ ((noinline))
+  #elif defined(_MSC_VER)
+    #define NOINLINE __declspec(noinline)
+  #else
+    #define NOINLINE
+  #endif
+#endif /* NOINLINE */
+
+#ifdef __cplusplus
+extern "C" {
+#ifndef FORCEINLINE
+ #define FORCEINLINE inline
+#endif
+#endif /* __cplusplus */
+#ifndef FORCEINLINE
+ #define FORCEINLINE
+#endif
+
+#if !ONLY_MSPACES
+
+/* ------------------- Declarations of public routines ------------------- */
+
+#ifndef USE_DL_PREFIX
+#define dlcalloc               calloc
+#define dlfree                 free
+#define dlmalloc               malloc
+#define dlmemalign             memalign
+#define dlrealloc              realloc
+#define dlvalloc               valloc
+#define dlpvalloc              pvalloc
+#define dlmallinfo             mallinfo
+#define dlmallopt              mallopt
+#define dlmalloc_trim          malloc_trim
+#define dlmalloc_stats         malloc_stats
+#define dlmalloc_usable_size   malloc_usable_size
+#define dlmalloc_footprint     malloc_footprint
+#define dlmalloc_max_footprint malloc_max_footprint
+#define dlindependent_calloc   independent_calloc
+#define dlindependent_comalloc independent_comalloc
+#endif /* USE_DL_PREFIX */
+
+
+/*
+  malloc(size_t n)
+  Returns a pointer to a newly allocated chunk of at least n bytes, or
+  null if no space is available, in which case errno is set to ENOMEM
+  on ANSI C systems.
+
+  If n is zero, malloc returns a minimum-sized chunk. (The minimum
+  size is 16 bytes on most 32bit systems, and 32 bytes on 64bit
+  systems.)  Note that size_t is an unsigned type, so calls with
+  arguments that would be negative if signed are interpreted as
+  requests for huge amounts of space, which will often fail. The
+  maximum supported value of n differs across systems, but is in all
+  cases less than the maximum representable value of a size_t.
+*/
+void* dlmalloc(size_t);
+
+/*
+  free(void* p)
+  Releases the chunk of memory pointed to by p, that had been previously
+  allocated using malloc or a related routine such as realloc.
+  It has no effect if p is null. If p was not malloced or already
+  freed, free(p) will by default cause the current program to abort.
+*/
+void  dlfree(void*);
+
+/*
+  calloc(size_t n_elements, size_t element_size);
+  Returns a pointer to n_elements * element_size bytes, with all locations
+  set to zero.
+*/
+void* dlcalloc(size_t, size_t);
+
+/*
+  realloc(void* p, size_t n)
+  Returns a pointer to a chunk of size n that contains the same data
+  as does chunk p up to the minimum of (n, p's size) bytes, or null
+  if no space is available.
+
+  The returned pointer may or may not be the same as p. The algorithm
+  prefers extending p in most cases when possible, otherwise it
+  employs the equivalent of a malloc-copy-free sequence.
+
+  If p is null, realloc is equivalent to malloc.
+
+  If space is not available, realloc returns null, errno is set (if on
+  ANSI) and p is NOT freed.
+
+  if n is for fewer bytes than already held by p, the newly unused
+  space is lopped off and freed if possible.  realloc with a size
+  argument of zero (re)allocates a minimum-sized chunk.
+
+  The old unix realloc convention of allowing the last-free'd chunk
+  to be used as an argument to realloc is not supported.
+*/
+
+void* dlrealloc(void*, size_t);
+
+/*
+  memalign(size_t alignment, size_t n);
+  Returns a pointer to a newly allocated chunk of n bytes, aligned
+  in accord with the alignment argument.
+
+  The alignment argument should be a power of two. If the argument is
+  not a power of two, the nearest greater power is used.
+  8-byte alignment is guaranteed by normal malloc calls, so don't
+  bother calling memalign with an argument of 8 or less.
+
+  Overreliance on memalign is a sure way to fragment space.
+*/
+void* dlmemalign(size_t, size_t);
+
+/*
+  valloc(size_t n);
+  Equivalent to memalign(pagesize, n), where pagesize is the page
+  size of the system. If the pagesize is unknown, 4096 is used.
+*/
+void* dlvalloc(size_t);
+
+/*
+  mallopt(int parameter_number, int parameter_value)
+  Sets tunable parameters. The format is to provide a
+  (parameter-number, parameter-value) pair.  mallopt then sets the
+  corresponding parameter to the argument value if it can (i.e., so
+  long as the value is meaningful), and returns 1 if successful else
+  0.  To workaround the fact that mallopt is specified to use int,
+  not size_t parameters, the value -1 is specially treated as the
+  maximum unsigned size_t value.
+
+  SVID/XPG/ANSI defines four standard param numbers for mallopt,
+  normally defined in malloc.h.  None of these are used in this malloc,
+  so setting them has no effect. But this malloc also supports other
+  options in mallopt. See below for details.  Briefly, supported
+  parameters are as follows (listed defaults are for "typical"
+  configurations).
+
+  Symbol            param #  default    allowed param values
+  M_TRIM_THRESHOLD     -1   2*1024*1024   any   (-1 disables)
+  M_GRANULARITY        -2     page size   any power of 2 >= page size
+  M_MMAP_THRESHOLD     -3      256*1024   any   (or 0 if no MMAP support)
+*/
+int dlmallopt(int, int);
+
+/*
+  malloc_footprint();
+  Returns the number of bytes obtained from the system.  The total
+  number of bytes allocated by malloc, realloc etc., is less than this
+  value. Unlike mallinfo, this function returns only a precomputed
+  result, so can be called frequently to monitor memory consumption.
+  Even if locks are otherwise defined, this function does not use them,
+  so results might not be up to date.
+*/
+size_t dlmalloc_footprint(void);
+
+/*
+  malloc_max_footprint();
+  Returns the maximum number of bytes obtained from the system. This
+  value will be greater than current footprint if deallocated space
+  has been reclaimed by the system. The peak number of bytes allocated
+  by malloc, realloc etc., is less than this value. Unlike mallinfo,
+  this function returns only a precomputed result, so can be called
+  frequently to monitor memory consumption.  Even if locks are
+  otherwise defined, this function does not use them, so results might
+  not be up to date.
+*/
+size_t dlmalloc_max_footprint(void);
+
+#if !NO_MALLINFO
+/*
+  mallinfo()
+  Returns (by copy) a struct containing various summary statistics:
+
+  arena:     current total non-mmapped bytes allocated from system
+  ordblks:   the number of free chunks
+  smblks:    always zero.
+  hblks:     current number of mmapped regions
+  hblkhd:    total bytes held in mmapped regions
+  usmblks:   the maximum total allocated space. This will be greater
+		than current total if trimming has occurred.
+  fsmblks:   always zero
+  uordblks:  current total allocated space (normal or mmapped)
+  fordblks:  total free space
+  keepcost:  the maximum number of bytes that could ideally be released
+	       back to system via malloc_trim. ("ideally" means that
+	       it ignores page restrictions etc.)
+
+  Because these fields may be narrower (e.g. ints, when the system
+  struct mallinfo is used) than the internal bookkeeping, the reported
+  values may wrap around zero and thus be inaccurate.
+*/
+struct mallinfo dlmallinfo(void);
+#endif /* NO_MALLINFO */
+
+/*
+  independent_calloc(size_t n_elements, size_t element_size, void* chunks[]);
+
+  independent_calloc is similar to calloc, but instead of returning a
+  single cleared space, it returns an array of pointers to n_elements
+  independent elements that can hold contents of size elem_size, each
+  of which starts out cleared, and can be independently freed,
+  realloc'ed etc. The elements are guaranteed to be adjacently
+  allocated (this is not guaranteed to occur with multiple callocs or
+  mallocs), which may also improve cache locality in some
+  applications.
+
+  The "chunks" argument is optional (i.e., may be null, which is
+  probably the most typical usage). If it is null, the returned array
+  is itself dynamically allocated and should also be freed when it is
+  no longer needed. Otherwise, the chunks array must be of at least
+  n_elements in length. It is filled in with the pointers to the
+  chunks.
+
+  In either case, independent_calloc returns this pointer array, or
+  null if the allocation failed.  If n_elements is zero and "chunks"
+  is null, it returns a chunk representing an array with zero elements
+  (which should be freed if not wanted).
+
+  Each element must be individually freed when it is no longer
+  needed. If you'd like to instead be able to free all at once, you
+  should instead use regular calloc and assign pointers into this
+  space to represent elements.  (In this case though, you cannot
+  independently free elements.)
+
+  independent_calloc simplifies and speeds up implementations of many
+  kinds of pools.  It may also be useful when constructing large data
+  structures that initially have a fixed number of fixed-sized nodes,
+  but the number is not known at compile time, and some of the nodes
+  may later need to be freed. For example:
+
+  struct Node { int item; struct Node* next; };
+
+  struct Node* build_list() {
+    struct Node** pool;
+    int n = read_number_of_nodes_needed();
+    if (n <= 0) return 0;
+    pool = (struct Node**)independent_calloc(n, sizeof(struct Node), 0);
+    if (pool == 0) die();
+    // organize into a linked list...
+    struct Node* first = pool[0];
+    for (int i = 0; i < n-1; ++i)
+      pool[i]->next = pool[i+1];
+    free(pool);     // Can now free the array (or not, if it is needed later)
+    return first;
+  }
+*/
+void** dlindependent_calloc(size_t, size_t, void**);
+
+/*
+  independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]);
+
+  independent_comalloc allocates, all at once, a set of n_elements
+  chunks with sizes indicated in the "sizes" array.    It returns
+  an array of pointers to these elements, each of which can be
+  independently freed, realloc'ed etc. The elements are guaranteed to
+  be adjacently allocated (this is not guaranteed to occur with
+  multiple callocs or mallocs), which may also improve cache locality
+  in some applications.
+
+  The "chunks" argument is optional (i.e., may be null). If it is null
+  the returned array is itself dynamically allocated and should also
+  be freed when it is no longer needed. Otherwise, the chunks array
+  must be of at least n_elements in length. It is filled in with the
+  pointers to the chunks.
+
+  In either case, independent_comalloc returns this pointer array, or
+  null if the allocation failed.  If n_elements is zero and chunks is
+  null, it returns a chunk representing an array with zero elements
+  (which should be freed if not wanted).
+
+  Each element must be individually freed when it is no longer
+  needed. If you'd like to instead be able to free all at once, you
+  should instead use a single regular malloc, and assign pointers at
+  particular offsets in the aggregate space. (In this case though, you
+  cannot independently free elements.)
+
+  independent_comalloc differs from independent_calloc in that each
+  element may have a different size, and also that it does not
+  automatically clear elements.
+
+  independent_comalloc can be used to speed up allocation in cases
+  where several structs or objects must always be allocated at the
+  same time.  For example:
+
+  struct Head { ... };
+  struct Foot { ... };
+
+  void send_message(char* msg) {
+    int msglen = strlen(msg);
+    size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) };
+    void* chunks[3];
+    if (independent_comalloc(3, sizes, chunks) == 0)
+      die();
+    struct Head* head = (struct Head*)(chunks[0]);
+    char*        body = (char*)(chunks[1]);
+    struct Foot* foot = (struct Foot*)(chunks[2]);
+    // ...
+  }
+
+  In general though, independent_comalloc is worth using only for
+  larger values of n_elements. For small values, you probably won't
+  detect enough difference from series of malloc calls to bother.
+
+  Overuse of independent_comalloc can increase overall memory usage,
+  since it cannot reuse existing noncontiguous small chunks that
+  might be available for some of the elements.
+*/
+void** dlindependent_comalloc(size_t, size_t*, void**);
+
+
+/*
+  pvalloc(size_t n);
+  Equivalent to valloc(minimum-page-that-holds(n)), that is,
+  round up n to nearest pagesize.
+ */
+void*  dlpvalloc(size_t);
+
+/*
+  malloc_trim(size_t pad);
+
+  If possible, gives memory back to the system (via negative arguments
+  to sbrk) if there is unused memory at the `high' end of the malloc
+  pool or in unused MMAP segments. You can call this after freeing
+  large blocks of memory to potentially reduce the system-level memory
+  requirements of a program. However, it cannot guarantee to reduce
+  memory. Under some allocation patterns, some large free blocks of
+  memory will be locked between two used chunks, so they cannot be
+  given back to the system.
+
+  The `pad' argument to malloc_trim represents the amount of free
+  trailing space to leave untrimmed. If this argument is zero, only
+  the minimum amount of memory to maintain internal data structures
+  will be left. Non-zero arguments can be supplied to maintain enough
+  trailing space to service future expected allocations without having
+  to re-obtain memory from the system.
+
+  Malloc_trim returns 1 if it actually released any memory, else 0.
+*/
+int  dlmalloc_trim(size_t);
+
+/*
+  malloc_stats();
+  Prints on stderr the amount of space obtained from the system (both
+  via sbrk and mmap), the maximum amount (which may be more than
+  current if malloc_trim and/or munmap got called), and the current
+  number of bytes allocated via malloc (or realloc, etc) but not yet
+  freed. Note that this is the number of bytes allocated, not the
+  number requested. It will be larger than the number requested
+  because of alignment and bookkeeping overhead. Because it includes
+  alignment wastage as being in use, this figure may be greater than
+  zero even when no user-level chunks are allocated.
+
+  The reported current and maximum system memory can be inaccurate if
+  a program makes other calls to system memory allocation functions
+  (normally sbrk) outside of malloc.
+
+  malloc_stats prints only the most commonly interesting statistics.
+  More information can be obtained by calling mallinfo.
+*/
+void  dlmalloc_stats(void);
+
+#endif /* ONLY_MSPACES */
+
+/*
+  malloc_usable_size(void* p);
+
+  Returns the number of bytes you can actually use in
+  an allocated chunk, which may be more than you requested (although
+  often not) due to alignment and minimum size constraints.
+  You can use this many bytes without worrying about
+  overwriting other allocated objects. This is not a particularly great
+  programming practice. malloc_usable_size can be more useful in
+  debugging and assertions, for example:
+
+  p = malloc(n);
+  assert(malloc_usable_size(p) >= 256);
+*/
+size_t dlmalloc_usable_size(void*);
+
+
+#if MSPACES
+
+/*
+  mspace is an opaque type representing an independent
+  region of space that supports mspace_malloc, etc.
+*/
+typedef void* mspace;
+
+/*
+  create_mspace creates and returns a new independent space with the
+  given initial capacity, or, if 0, the default granularity size.  It
+  returns null if there is no system memory available to create the
+  space.  If argument locked is non-zero, the space uses a separate
+  lock to control access. The capacity of the space will grow
+  dynamically as needed to service mspace_malloc requests.  You can
+  control the sizes of incremental increases of this space by
+  compiling with a different DEFAULT_GRANULARITY or dynamically
+  setting with mallopt(M_GRANULARITY, value).
+*/
+mspace create_mspace(size_t capacity, int locked);
+
+/*
+  destroy_mspace destroys the given space, and attempts to return all
+  of its memory back to the system, returning the total number of
+  bytes freed. After destruction, the results of access to all memory
+  used by the space become undefined.
+*/
+size_t destroy_mspace(mspace msp);
+
+/*
+  create_mspace_with_base uses the memory supplied as the initial base
+  of a new mspace. Part (less than 128*sizeof(size_t) bytes) of this
+  space is used for bookkeeping, so the capacity must be at least this
+  large. (Otherwise 0 is returned.) When this initial space is
+  exhausted, additional memory will be obtained from the system.
+  Destroying this space will deallocate all additionally allocated
+  space (if possible) but not the initial base.
+*/
+mspace create_mspace_with_base(void* base, size_t capacity, int locked);
+
+/*
+  mspace_mmap_large_chunks controls whether requests for large chunks
+  are allocated in their own mmapped regions, separate from others in
+  this mspace. By default this is enabled, which reduces
+  fragmentation. However, such chunks are not necessarily released to
+  the system upon destroy_mspace.  Disabling by setting to false may
+  increase fragmentation, but avoids leakage when relying on
+  destroy_mspace to release all memory allocated using this space.
+*/
+int mspace_mmap_large_chunks(mspace msp, int enable);
+
+
+/*
+  mspace_malloc behaves as malloc, but operates within
+  the given space.
+*/
+void* mspace_malloc(mspace msp, size_t bytes);
+
+/*
+  mspace_free behaves as free, but operates within
+  the given space.
+
+  If compiled with FOOTERS==1, mspace_free is not actually needed.
+  free may be called instead of mspace_free because freed chunks from
+  any space are handled by their originating spaces.
+*/
+void mspace_free(mspace msp, void* mem);
+
+/*
+  mspace_realloc behaves as realloc, but operates within
+  the given space.
+
+  If compiled with FOOTERS==1, mspace_realloc is not actually
+  needed.  realloc may be called instead of mspace_realloc because
+  realloced chunks from any space are handled by their originating
+  spaces.
+*/
+void* mspace_realloc(mspace msp, void* mem, size_t newsize);
+
+/*
+  mspace_calloc behaves as calloc, but operates within
+  the given space.
+*/
+void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size);
+
+/*
+  mspace_memalign behaves as memalign, but operates within
+  the given space.
+*/
+void* mspace_memalign(mspace msp, size_t alignment, size_t bytes);
+
+/*
+  mspace_independent_calloc behaves as independent_calloc, but
+  operates within the given space.
+*/
+void** mspace_independent_calloc(mspace msp, size_t n_elements,
+				 size_t elem_size, void* chunks[]);
+
+/*
+  mspace_independent_comalloc behaves as independent_comalloc, but
+  operates within the given space.
+*/
+void** mspace_independent_comalloc(mspace msp, size_t n_elements,
+				   size_t sizes[], void* chunks[]);
+
+/*
+  mspace_footprint() returns the number of bytes obtained from the
+  system for this space.
+*/
+size_t mspace_footprint(mspace msp);
+
+/*
+  mspace_max_footprint() returns the peak number of bytes obtained from the
+  system for this space.
+*/
+size_t mspace_max_footprint(mspace msp);
+
+
+#if !NO_MALLINFO
+/*
+  mspace_mallinfo behaves as mallinfo, but reports properties of
+  the given space.
+*/
+struct mallinfo mspace_mallinfo(mspace msp);
+#endif /* NO_MALLINFO */
+
+/*
+  mspace_usable_size(void* mem) behaves the same as malloc_usable_size.
+*/
+  size_t mspace_usable_size(void* mem);
+
+/*
+  mspace_malloc_stats behaves as malloc_stats, but reports
+  properties of the given space.
+*/
+void mspace_malloc_stats(mspace msp);
+
+/*
+  mspace_trim behaves as malloc_trim, but
+  operates within the given space.
+*/
+int mspace_trim(mspace msp, size_t pad);
+
+/*
+  An alias for mallopt.
+*/
+int mspace_mallopt(int, int);
+
+#endif /* MSPACES */
+
+#ifdef __cplusplus
+};  /* end of extern "C" */
+#endif /* __cplusplus */
+
+/*
+  ========================================================================
+  To make a fully customizable malloc.h header file, cut everything
+  above this line, put into file malloc.h, edit to suit, and #include it
+  on the next line, as well as in programs that use this malloc.
+  ========================================================================
+*/
+
+/* #include "malloc.h" */
+
+/*------------------------------ internal #includes ---------------------- */
+
+#ifdef WIN32
+#ifndef __GNUC__
+#pragma warning( disable : 4146 ) /* no "unsigned" warnings */
+#endif
+#endif /* WIN32 */
+
+#include <stdio.h>       /* for printing in malloc_stats */
+
+#ifndef LACKS_ERRNO_H
+#include <errno.h>       /* for MALLOC_FAILURE_ACTION */
+#endif /* LACKS_ERRNO_H */
+#if FOOTERS
+#include <time.h>        /* for magic initialization */
+#endif /* FOOTERS */
+#ifndef LACKS_STDLIB_H
+#include <stdlib.h>      /* for abort() */
+#endif /* LACKS_STDLIB_H */
+#ifdef DEBUG
+#if ABORT_ON_ASSERT_FAILURE
+#define assert(x) if(!(x)) ABORT
+#else /* ABORT_ON_ASSERT_FAILURE */
+#include <assert.h>
+#endif /* ABORT_ON_ASSERT_FAILURE */
+#else  /* DEBUG */
+#ifndef assert
+#define assert(x)
+#endif
+#define DEBUG 0
+#endif /* DEBUG */
+#ifndef LACKS_STRING_H
+#include <string.h>      /* for memset etc */
+#endif  /* LACKS_STRING_H */
+#if USE_BUILTIN_FFS
+#ifndef LACKS_STRINGS_H
+#include <strings.h>     /* for ffs */
+#endif /* LACKS_STRINGS_H */
+#endif /* USE_BUILTIN_FFS */
+#if HAVE_MMAP
+#ifndef LACKS_SYS_MMAN_H
+#include <sys/mman.h>    /* for mmap */
+#endif /* LACKS_SYS_MMAN_H */
+#ifndef LACKS_FCNTL_H
+#include <fcntl.h>
+#endif /* LACKS_FCNTL_H */
+#endif /* HAVE_MMAP */
+#ifndef LACKS_UNISTD_H
+#include <unistd.h>     /* for sbrk, sysconf */
+#else /* LACKS_UNISTD_H */
+#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__)
+extern void*     sbrk(ptrdiff_t);
+#endif /* FreeBSD etc */
+#endif /* LACKS_UNISTD_H */
+
+/* Declarations for locking */
+#if USE_LOCKS
+#ifndef WIN32
+#include <pthread.h>
+#if defined (__SVR4) && defined (__sun)  /* solaris */
+#include <thread.h>
+#endif /* solaris */
+#else
+#ifndef _M_AMD64
+/* These are already defined on AMD64 builds */
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+#ifndef __MINGW32__
+LONG __cdecl _InterlockedCompareExchange(LONG volatile *Dest, LONG Exchange, LONG Comp);
+LONG __cdecl _InterlockedExchange(LONG volatile *Target, LONG Value);
+#endif
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* _M_AMD64 */
+#ifndef __MINGW32__
+#pragma intrinsic (_InterlockedCompareExchange)
+#pragma intrinsic (_InterlockedExchange)
+#else
+  /* --[ start GCC compatibility ]----------------------------------------------
+   * Compatibility <intrin_x86.h> header for GCC -- GCC equivalents of intrinsic
+   * Microsoft Visual C++ functions. Originally developed for the ReactOS
+   * (<http://www.reactos.org/>) and TinyKrnl (<http://www.tinykrnl.org/>)
+   * projects.
+   *
+   * Copyright (c) 2006 KJK::Hyperion <hackbunny@reactos.com>
+   *
+   * Permission is hereby granted, free of charge, to any person obtaining a
+   * copy of this software and associated documentation files (the "Software"),
+   * to deal in the Software without restriction, including without limitation
+   * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+   * and/or sell copies of the Software, and to permit persons to whom the
+   * Software is furnished to do so, subject to the following conditions:
+   *
+   * The above copyright notice and this permission notice shall be included in
+   * all copies or substantial portions of the Software.
+   *
+   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   * DEALINGS IN THE SOFTWARE.
+   */
+
+  /*** Atomic operations ***/
+  #if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) > 40100
+    #undef _ReadWriteBarrier
+    #define _ReadWriteBarrier() __sync_synchronize()
+  #else
+    static __inline__ __attribute__((always_inline)) long __sync_lock_test_and_set(volatile long * const Target, const long Value) /* stand-in for the GCC builtin of the same name, compiled only when the GCC version test above is not met */
+    {
+      long res; /* second asm output; this is the value handed back to the caller */
+      __asm__ __volatile__("xchg%z0 %2, %0" : "=g" (*(Target)), "=r" (res) : "1" (Value)); /* xchg swaps *Target with the register that starts out holding Value; "1" ties that register to res, so res ends up with the prior contents of *Target */
+      return res;
+    }
+    static void __inline__ __attribute__((always_inline)) _MemoryBarrier(void) /* what _ReadWriteBarrier() expands to in this branch */
+    {
+      __asm__ __volatile__("" : : : "memory"); /* empty instruction string with a "memory" clobber: no machine code of its own, only a barrier for the compiler */
+    }
+    #define _ReadWriteBarrier() _MemoryBarrier()
+  #endif
+  /* MinGW replacement for the MSVC _InterlockedExchange intrinsic: issues _ReadWriteBarrier(), then returns the result of __sync_lock_test_and_set(Target, Value). BUGBUG: GCC only supports full barriers */
+  static __inline__ __attribute__((always_inline)) long _InterlockedExchange(volatile long * const Target, const long Value)
+  {
+    /* NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier */
+    _ReadWriteBarrier();
+    return __sync_lock_test_and_set(Target, Value);
+  }
+  /* --[ end GCC compatibility ]---------------------------------------------- */
+#endif
+#define interlockedcompareexchange _InterlockedCompareExchange
+#define interlockedexchange _InterlockedExchange
+#endif /* Win32 */
+#endif /* USE_LOCKS */
+
+/* Declarations for bit scanning on win32 */
+#if defined(_MSC_VER) && _MSC_VER>=1300
+#ifndef BitScanForward	/* Try to avoid pulling in WinNT.h */
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+unsigned char _BitScanForward(unsigned long *index, unsigned long mask);
+unsigned char _BitScanReverse(unsigned long *index, unsigned long mask);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#define BitScanForward _BitScanForward
+#define BitScanReverse _BitScanReverse
+#pragma intrinsic(_BitScanForward)
+#pragma intrinsic(_BitScanReverse)
+#endif /* BitScanForward */
+#endif /* defined(_MSC_VER) && _MSC_VER>=1300 */
+
+#ifndef WIN32
+#ifndef malloc_getpagesize
+#  ifdef _SC_PAGESIZE         /* some SVR4 systems omit an underscore */
+#    ifndef _SC_PAGE_SIZE
+#      define _SC_PAGE_SIZE _SC_PAGESIZE
+#    endif
+#  endif
+#  ifdef _SC_PAGE_SIZE
+#    define malloc_getpagesize sysconf(_SC_PAGE_SIZE)
+#  else
+#    if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE)
+       extern size_t getpagesize();
+#      define malloc_getpagesize getpagesize()
+#    else
+#      ifdef WIN32 /* use supplied emulation of getpagesize */
+#        define malloc_getpagesize getpagesize()
+#      else
+#        ifndef LACKS_SYS_PARAM_H
+#          include <sys/param.h>
+#        endif
+#        ifdef EXEC_PAGESIZE
+#          define malloc_getpagesize EXEC_PAGESIZE
+#        else
+#          ifdef NBPG
+#            ifndef CLSIZE
+#              define malloc_getpagesize NBPG
+#            else
+#              define malloc_getpagesize (NBPG * CLSIZE)
+#            endif
+#          else
+#            ifdef NBPC
+#              define malloc_getpagesize NBPC
+#            else
+#              ifdef PAGESIZE
+#                define malloc_getpagesize PAGESIZE
+#              else /* just guess */
+#                define malloc_getpagesize ((size_t)4096U)
+#              endif
+#            endif
+#          endif
+#        endif
+#      endif
+#    endif
+#  endif
+#endif
+#endif
+
+
+
+/* ------------------- size_t and alignment properties -------------------- */
+
+/* The byte and bit size of a size_t */
+#define SIZE_T_SIZE         (sizeof(size_t))
+#define SIZE_T_BITSIZE      (sizeof(size_t) << 3)
+
+/* Some constants coerced to size_t */
+/* Annoying but necessary to avoid errors on some platforms */
+#define SIZE_T_ZERO         ((size_t)0)
+#define SIZE_T_ONE          ((size_t)1)
+#define SIZE_T_TWO          ((size_t)2)
+#define SIZE_T_FOUR         ((size_t)4)
+#define TWO_SIZE_T_SIZES    (SIZE_T_SIZE<<1)
+#define FOUR_SIZE_T_SIZES   (SIZE_T_SIZE<<2)
+#define SIX_SIZE_T_SIZES    (FOUR_SIZE_T_SIZES+TWO_SIZE_T_SIZES)
+#define HALF_MAX_SIZE_T     (MAX_SIZE_T / 2U)
+
+/* The bit mask value corresponding to MALLOC_ALIGNMENT */
+#define CHUNK_ALIGN_MASK    (MALLOC_ALIGNMENT - SIZE_T_ONE)
+
+/* True if address a has acceptable alignment */
+#define is_aligned(A)       (((size_t)((A)) & (CHUNK_ALIGN_MASK)) == 0)
+
+/* the number of bytes to offset an address to align it */
+#define align_offset(A)\
+ ((((size_t)(A) & CHUNK_ALIGN_MASK) == 0)? 0 :\
+  ((MALLOC_ALIGNMENT - ((size_t)(A) & CHUNK_ALIGN_MASK)) & CHUNK_ALIGN_MASK))
+
+/* -------------------------- MMAP preliminaries ------------------------- */
+
+/*
+   If HAVE_MORECORE or HAVE_MMAP are false, we just define calls and
+   checks to fail so compiler optimizer can delete code rather than
+   using so many "#if"s.
+*/
+
+
+/* MORECORE and MMAP must return MFAIL on failure */
+#define MFAIL                ((void*)(MAX_SIZE_T))
+#define CMFAIL               ((char*)(MFAIL)) /* defined for convenience */
+
+#if HAVE_MMAP
+
+#ifndef WIN32
+#define MUNMAP_DEFAULT(a, s)  munmap((a), (s))
+#define MMAP_PROT            (PROT_READ|PROT_WRITE)
+#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
+#define MAP_ANONYMOUS        MAP_ANON
+#endif /* MAP_ANON */
+#ifdef MAP_ANONYMOUS
+#define MMAP_FLAGS           (MAP_PRIVATE|MAP_ANONYMOUS)
+#define MMAP_DEFAULT(s)       mmap(0, (s), MMAP_PROT, MMAP_FLAGS, -1, 0)
+#else /* MAP_ANONYMOUS */
+/*
+   Nearly all versions of mmap support MAP_ANONYMOUS, so the following
+   is unlikely to be needed, but is supplied just in case.
+*/
+#define MMAP_FLAGS           (MAP_PRIVATE)
+static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */
+#define MMAP_DEFAULT(s) ((dev_zero_fd < 0) ? \
+	   (dev_zero_fd = open("/dev/zero", O_RDWR), \
+	    mmap(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0)) : \
+	    mmap(0, (s), MMAP_PROT, MMAP_FLAGS, dev_zero_fd, 0))
+#endif /* MAP_ANONYMOUS */
+
+#define DIRECT_MMAP_DEFAULT(s) MMAP_DEFAULT(s)
+
+#else /* WIN32 */
+
+/* Win32 stand-in for mmap: reserve and commit read/write pages with VirtualAlloc, turning its null failure result into MFAIL */
+static FORCEINLINE void* win32mmap(size_t size) {
+  void* const mapped = VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT, PAGE_READWRITE);
+  return (mapped == 0)? MFAIL : mapped;
+}
+
+/* Direct-MMAP flavour: reserve+commit as usual but with MEM_TOP_DOWN to minimize interference; null from VirtualAlloc becomes MFAIL */
+static FORCEINLINE void* win32direct_mmap(size_t size) {
+  void* const mapped =
+    VirtualAlloc(0, size, MEM_RESERVE|MEM_COMMIT|MEM_TOP_DOWN, PAGE_READWRITE);
+  return (mapped == 0)? MFAIL : mapped;
+}
+
+/* Releases [ptr, ptr+size) one VirtualQuery region at a time, which is what lets coalesced segments be unmapped; returns 0 on success and -1 at the first failed query, check or free */
+static FORCEINLINE int win32munmap(void* ptr, size_t size) {
+  MEMORY_BASIC_INFORMATION region;
+  char* cursor = (char*)ptr;
+  for (; size != 0; cursor += region.RegionSize, size -= region.RegionSize) {
+    if (!VirtualQuery(cursor, &region, sizeof(region)))
+      return -1;
+    /* only free a committed region that starts exactly at cursor, is the
+       base of its own allocation, and does not extend past what remains */
+    if (!(region.BaseAddress == cursor && region.AllocationBase == cursor &&
+          region.State == MEM_COMMIT && region.RegionSize <= size))
+      return -1;
+    if (!VirtualFree(cursor, 0, MEM_RELEASE))
+      return -1;
+  }
+  return 0;
+}
+
+#define MMAP_DEFAULT(s)             win32mmap(s)
+#define MUNMAP_DEFAULT(a, s)        win32munmap((a), (s))
+#define DIRECT_MMAP_DEFAULT(s)      win32direct_mmap(s)
+#endif /* WIN32 */
+#endif /* HAVE_MMAP */
+
+#if HAVE_MREMAP
+#ifndef WIN32
+#define MREMAP_DEFAULT(addr, osz, nsz, mv) mremap((addr), (osz), (nsz), (mv))
+#endif /* WIN32 */
+#endif /* HAVE_MREMAP */
+
+
+/**
+ * Define CALL_MORECORE
+ */
+#if HAVE_MORECORE
+    #ifdef MORECORE
+	#define CALL_MORECORE(S)    MORECORE(S)
+    #else  /* MORECORE */
+	#define CALL_MORECORE(S)    MORECORE_DEFAULT(S)
+    #endif /* MORECORE */
+#else  /* HAVE_MORECORE */
+    #define CALL_MORECORE(S)        MFAIL
+#endif /* HAVE_MORECORE */
+
+/**
+ * Define CALL_MMAP/CALL_MUNMAP/CALL_DIRECT_MMAP
+ */
+#if HAVE_MMAP
+    #define IS_MMAPPED_BIT          (SIZE_T_ONE)
+    #define USE_MMAP_BIT            (SIZE_T_ONE)
+
+    #ifdef MMAP
+	#define CALL_MMAP(s)        MMAP(s)
+    #else /* MMAP */
+	#define CALL_MMAP(s)        MMAP_DEFAULT(s)
+    #endif /* MMAP */
+    #ifdef MUNMAP
+	#define CALL_MUNMAP(a, s)   MUNMAP((a), (s))
+    #else /* MUNMAP */
+	#define CALL_MUNMAP(a, s)   MUNMAP_DEFAULT((a), (s))
+    #endif /* MUNMAP */
+    #ifdef DIRECT_MMAP
+	#define CALL_DIRECT_MMAP(s) DIRECT_MMAP(s)
+    #else /* DIRECT_MMAP */
+	#define CALL_DIRECT_MMAP(s) DIRECT_MMAP_DEFAULT(s)
+    #endif /* DIRECT_MMAP */
+#else  /* HAVE_MMAP */
+    #define IS_MMAPPED_BIT          (SIZE_T_ZERO)
+    #define USE_MMAP_BIT            (SIZE_T_ZERO)
+
+    #define MMAP(s)                 MFAIL
+    #define MUNMAP(a, s)            (-1)
+    #define DIRECT_MMAP(s)          MFAIL
+    #define CALL_DIRECT_MMAP(s)     DIRECT_MMAP(s)
+    #define CALL_MMAP(s)            MMAP(s)
+    #define CALL_MUNMAP(a, s)       MUNMAP((a), (s))
+#endif /* HAVE_MMAP */
+
+/**
+ * Define CALL_MREMAP
+ */
+#if HAVE_MMAP && HAVE_MREMAP
+    #ifdef MREMAP
+	#define CALL_MREMAP(addr, osz, nsz, mv) MREMAP((addr), (osz), (nsz), (mv))
+    #else /* MREMAP */
+	#define CALL_MREMAP(addr, osz, nsz, mv) MREMAP_DEFAULT((addr), (osz), (nsz), (mv))
+    #endif /* MREMAP */
+#else  /* HAVE_MMAP && HAVE_MREMAP */
+    #define CALL_MREMAP(addr, osz, nsz, mv)     MFAIL
+#endif /* HAVE_MMAP && HAVE_MREMAP */
+
+/* mstate bit set if contiguous morecore disabled or failed */
+#define USE_NONCONTIGUOUS_BIT (4U)
+
+/* segment bit set in create_mspace_with_base */
+#define EXTERN_BIT            (8U)
+
+
+/* --------------------------- Lock preliminaries ------------------------ */
+
+/*
+  When locks are defined, there is one global lock, plus
+  one per-mspace lock.
+
+  The global lock ensures that mparams.magic and other unique
+  mparams values are initialized only once. It also protects
+  sequences of calls to MORECORE.  In many cases sys_alloc requires
+  two calls, that should not be interleaved with calls by other
+  threads.  This does not protect against direct calls to MORECORE
+  by other threads not using this lock, so there is still code to
+  cope as best we can with interference.
+
+  Per-mspace locks surround calls to malloc, free, etc.  To enable use
+  in layered extensions, per-mspace locks are reentrant.
+
+  Because lock-protected regions generally have bounded times, it is
+  OK to use the supplied simple spinlocks in the custom versions for
+  x86.
+
+  If USE_LOCKS is > 1, the definitions of lock routines here are
+  bypassed, in which case you will need to define at least
+  INITIAL_LOCK, ACQUIRE_LOCK, RELEASE_LOCK and possibly TRY_LOCK
+  (which is not used in this malloc, but commonly needed in
+  extensions.)
+*/
+
+#if USE_LOCKS == 1
+
+#if USE_SPIN_LOCKS
+#ifndef WIN32
+
+/* Custom pthread-style spin locks on x86 and x64 for gcc; recursive (the owner may re-acquire) */
+struct pthread_mlock_t {
+  volatile unsigned int l; /* lock word: 0 when free, set to 1 by the cmpxchgl in the acquire/try routines */
+  volatile unsigned int c; /* nesting count: 1 on first acquire, +1 per re-acquire by the owner, -1 per release */
+  volatile pthread_t threadid; /* thread that holds the lock; reset to 0 on the outermost release */
+};
+#define MLOCK_T struct        pthread_mlock_t
+#define CURRENT_THREAD        pthread_self()
+#define INITIAL_LOCK(sl)      (memset(sl, 0, sizeof(MLOCK_T)), 0)
+#define ACQUIRE_LOCK(sl)      pthread_acquire_lock(sl)
+#define RELEASE_LOCK(sl)      pthread_release_lock(sl)
+#define TRY_LOCK(sl)          pthread_try_lock(sl)
+#define SPINS_PER_YIELD       63
+
+static MLOCK_T malloc_global_mutex = { 0, 0, 0};
+
+static FORCEINLINE int pthread_acquire_lock (MLOCK_T *sl) { /* blocking, recursive acquire of sl; loops until it succeeds and then returns 0 */
+  int spins = 0; /* number of loop passes that did not obtain the lock */
+  volatile unsigned int* lp = &sl->l;
+  for (;;) {
+    if (*lp != 0) {
+      if (sl->threadid == CURRENT_THREAD) { /* already held by this thread: just nest */
+	++sl->c;
+	return 0;
+      }
+    }
+    else {
+      /* place args to cmpxchgl in locals to evade oddities in some gccs */
+      int cmp = 0;
+      int val = 1;
+      int ret;
+      __asm__ __volatile__  ("lock; cmpxchgl %1, %2"
+			     : "=a" (ret)
+			     : "r" (val), "m" (*(lp)), "0"(cmp)
+			     : "memory", "cc");
+      if (!ret) { /* eax still 0: the lock word was 0 and now holds 1, so we own it */
+	assert(!sl->threadid);
+	sl->c = 1;
+	sl->threadid = CURRENT_THREAD;
+	return 0;
+      }
+    } /* every failed pass reaches the check below, whether the lock was seen held by another thread or the cmpxchgl lost a race */
+    if ((++spins & SPINS_PER_YIELD) == 0) { /* give up the CPU once per SPINS_PER_YIELD+1 failed passes */
+#if defined (__SVR4) && defined (__sun) /* solaris */
+      thr_yield();
+#else
+#if defined(__linux__) || defined(__FreeBSD__) || defined(__APPLE__)
+      sched_yield();
+#else  /* no-op yield on unknown systems */
+      ;
+#endif /* __linux__ || __FreeBSD__ || __APPLE__ */
+#endif /* solaris */
+    }
+  }
+}
+
+static FORCEINLINE void pthread_release_lock (MLOCK_T *sl) { /* undoes one acquire; the asserts expect the caller to be the current owner */
+  assert(sl->l != 0);
+  assert(sl->threadid == CURRENT_THREAD);
+  if (--sl->c == 0) { /* outermost release: actually free the lock */
+    volatile unsigned int* lp = &sl->l;
+    int prev = 0;
+    int ret;
+    sl->threadid = 0; /* clear the owner before the lock word is released */
+    __asm__ __volatile__ ("lock; xchgl %0, %1" /* swaps 0 (prev) into the lock word; the old value lands in ret and is not used */
+			  : "=r" (ret)
+			  : "m" (*(lp)), "0"(prev)
+			  : "memory");
+  }
+}
+
+static FORCEINLINE int pthread_try_lock (MLOCK_T *sl) { /* single non-blocking attempt: 1 if the lock is now held (or re-entered) by this thread, 0 otherwise */
+  volatile unsigned int* lp = &sl->l;
+  if (*lp != 0) {
+      if (sl->threadid == CURRENT_THREAD) { /* recursive acquire by the owner */
+	++sl->c;
+	return 1;
+      }
+  }
+  else {
+    int cmp = 0;
+    int val = 1;
+    int ret;
+    __asm__ __volatile__  ("lock; cmpxchgl %1, %2" /* if the lock word equals cmp (0) store val (1); ret comes back 0 only on that success */
+			   : "=a" (ret)
+			   : "r" (val), "m" (*(lp)), "0"(cmp)
+			   : "memory", "cc");
+    if (!ret) {
+      assert(!sl->threadid);
+      sl->c = 1;
+      sl->threadid = CURRENT_THREAD;
+      return 1;
+    }
+  }
+  return 0; /* held by another thread, or the cmpxchgl lost a race */
+}
+
+
+#else /* WIN32 */
+/* Custom win32-style spin locks on x86 and x64 for MSC; recursive (the owner may re-acquire) */
+struct win32_mlock_t
+{
+  volatile long l; /* lock word: 0 when free, 1 once taken via interlockedexchange */
+  volatile unsigned int c; /* nesting count kept by the owning thread */
+  volatile long threadid; /* id of the owning thread; reset to 0 on the outermost release */
+};
+
+static inline int return_0(int i) { (void)i; return 0; } /* ignores its argument and yields 0; used as the value of the INITIAL_LOCK comma expression */
+#define MLOCK_T               struct win32_mlock_t
+#define CURRENT_THREAD        win32_getcurrentthreadid()
+#define INITIAL_LOCK(sl)      (memset(sl, 0, sizeof(MLOCK_T)), return_0(0))
+#define ACQUIRE_LOCK(sl)      win32_acquire_lock(sl)
+#define RELEASE_LOCK(sl)      win32_release_lock(sl)
+#define TRY_LOCK(sl)          win32_try_lock(sl)
+#define SPINS_PER_YIELD       63
+
+static MLOCK_T malloc_global_mutex = { 0, 0, 0};
+
+static FORCEINLINE long win32_getcurrentthreadid(void) {
+  /* Supplies CURRENT_THREAD for the win32 spin locks. */
+#if defined(_MSC_VER) && defined(_M_IX86)
+  /* 32-bit MSVC only: treat the dword at fs:0x18 as a pointer to the
+     per-thread structure and return the long stored 0x24 bytes into it,
+     instead of calling into the Win32 API. */
+  long *perthread=(long *)__readfsdword(0x18);
+  const long id=perthread[0x24/sizeof(long)];
+  return id;
+#else
+  /* Everything else asks the Win32 API: non-MSVC compilers, MSVC x64
+     (where a direct read is still marked todo) and any other MSVC
+     target. */
+  return GetCurrentThreadId();
+#endif
+}
+
+static FORCEINLINE int win32_acquire_lock (MLOCK_T *sl) { /* blocking, recursive acquire of sl; loops until it succeeds and then returns 0 */
+  int spins = 0; /* failed passes so far; drives the periodic SleepEx below */
+  for (;;) {
+    if (sl->l != 0) {
+      if (sl->threadid == CURRENT_THREAD) { /* already held by this thread: just nest */
+	++sl->c;
+	return 0;
+      }
+    }
+    else {
+      if (!interlockedexchange(&sl->l, 1)) {
+	assert(!sl->threadid);
+	/* previous lock word was 0, so this thread took a free lock: record owner, depth 1 */
+	sl->threadid = CURRENT_THREAD;
+	sl->c = 1;
+	return 0;
+      }
+    }
+    if ((++spins & SPINS_PER_YIELD) == 0) /* once per SPINS_PER_YIELD+1 failed passes */
+      SleepEx(0, FALSE);
+  }
+}
+
+static FORCEINLINE void win32_release_lock (MLOCK_T *sl) { /* undoes one acquire; the asserts expect the caller to be the current owner */
+  assert(sl->threadid == CURRENT_THREAD);
+  assert(sl->l != 0);
+  if (--sl->c == 0) { /* outermost release: actually free the lock */
+    sl->threadid = 0; /* clear the owner before the lock word is released */
+    interlockedexchange (&sl->l, 0); /* lock word back to 0 = free */
+  }
+}
+
+static FORCEINLINE int win32_try_lock (MLOCK_T *sl) { /* single non-blocking attempt: 1 if the lock is now held (or re-entered) by this thread, 0 otherwise */
+  if(sl->l != 0) {
+      if (sl->threadid == CURRENT_THREAD) { /* recursive acquire by the owner */
+	++sl->c;
+	return 1;
+      }
+  }
+  else {
+    if (!interlockedexchange(&sl->l, 1)){ /* previous value 0 means we took a free lock */
+      assert(!sl->threadid);
+      sl->threadid = CURRENT_THREAD;
+      sl->c = 1;
+      return 1;
+    }
+  }
+  return 0; /* held by another thread, or another thread won the exchange */
+}
+
+#endif /* WIN32 */
+#else /* USE_SPIN_LOCKS */
+
+#ifndef WIN32
+/* pthreads-based locks */
+
+#define MLOCK_T               pthread_mutex_t
+#define CURRENT_THREAD        pthread_self()
+#define INITIAL_LOCK(sl)      pthread_init_lock(sl)
+#define ACQUIRE_LOCK(sl)      pthread_mutex_lock(sl)
+#define RELEASE_LOCK(sl)      pthread_mutex_unlock(sl)
+#define TRY_LOCK(sl)          (!pthread_mutex_trylock(sl))
+
+static MLOCK_T malloc_global_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Cope with old-style linux recursive lock initialization by adding */
+/* skipped internal declaration from pthread.h */
+#ifdef linux
+#ifndef PTHREAD_MUTEX_RECURSIVE
+extern int pthread_mutexattr_setkind_np __P ((pthread_mutexattr_t *__attr,
+					   int __kind));
+#define PTHREAD_MUTEX_RECURSIVE PTHREAD_MUTEX_RECURSIVE_NP
+#define pthread_mutexattr_settype(x,y) pthread_mutexattr_setkind_np(x,y)
+#endif
+#endif
+
+static int pthread_init_lock (MLOCK_T *sl) { /* sets *sl up as a recursive mutex; returns 0 on success, 1 if any pthread call fails */
+  pthread_mutexattr_t attr;
+  int failed;
+  if (pthread_mutexattr_init(&attr)) return 1; /* attr was never initialised, so there is nothing to destroy */
+  failed = (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE) != 0) ||
+           (pthread_mutex_init(sl, &attr) != 0); /* mutex_init is skipped when settype already failed */
+  return (pthread_mutexattr_destroy(&attr) != 0) || failed; /* attr is destroyed on every path after a successful init; a destroy failure is reported too */
+}
+
+#else /* WIN32 */
+/* Win32 critical sections */
+#define MLOCK_T               CRITICAL_SECTION
+#define CURRENT_THREAD        GetCurrentThreadId()
+#define INITIAL_LOCK(s)       (!InitializeCriticalSectionAndSpinCount((s), 0x80000000|4000))
+#define ACQUIRE_LOCK(s)       (EnterCriticalSection(s), 0)
+#define RELEASE_LOCK(s)       LeaveCriticalSection(s)
+#define TRY_LOCK(s)           TryEnterCriticalSection(s)
+#define NEED_GLOBAL_LOCK_INIT
+
+static MLOCK_T malloc_global_mutex;
+static volatile long malloc_global_mutex_status;
+
+/* Use spin loop to initialize global lock; malloc_global_mutex_status is 0 before init, -1 while one thread initializes, 1 once done */
+static void init_malloc_global_mutex() {
+  for (;;) {
+    long stat = malloc_global_mutex_status;
+    if (stat > 0) /* already initialized */
+      return;
+    /* transition to < 0 while initializing, then to > 0 */
+    if (stat == 0 &&
+	interlockedcompareexchange(&malloc_global_mutex_status, -1, 0) == 0) { /* only the thread whose 0 -> -1 swap succeeds enters here */
+      InitializeCriticalSection(&malloc_global_mutex);
+      interlockedexchange(&malloc_global_mutex_status,1); /* publish completion */
+      return;
+    }
+    SleepEx(0, FALSE); /* status not yet > 0 and we are not the initializer: yield, then re-check */
+  }
+}
+
+#endif /* WIN32 */
+#endif /* USE_SPIN_LOCKS */
+#endif /* USE_LOCKS == 1 */
+
+/* -----------------------  User-defined locks ------------------------ */
+
+#if USE_LOCKS > 1
+/* Define your own lock implementation here */
+/* #define INITIAL_LOCK(sl)  ... */
+/* #define ACQUIRE_LOCK(sl)  ... */
+/* #define RELEASE_LOCK(sl)  ... */
+/* #define TRY_LOCK(sl) ... */
+/* static MLOCK_T malloc_global_mutex = ... */
+#endif /* USE_LOCKS > 1 */
+
+/* -----------------------  Lock-based state ------------------------ */
+
+#if USE_LOCKS
+#define USE_LOCK_BIT               (2U)
+#else  /* USE_LOCKS */
+#define USE_LOCK_BIT               (0U)
+#define INITIAL_LOCK(l)
+#endif /* USE_LOCKS */
+
+#if USE_LOCKS
+#define ACQUIRE_MALLOC_GLOBAL_LOCK()  ACQUIRE_LOCK(&malloc_global_mutex);
+#define RELEASE_MALLOC_GLOBAL_LOCK()  RELEASE_LOCK(&malloc_global_mutex);
+#else  /* USE_LOCKS */
+#define ACQUIRE_MALLOC_GLOBAL_LOCK()
+#define RELEASE_MALLOC_GLOBAL_LOCK()
+#endif /* USE_LOCKS */
+
+
+/* -----------------------  Chunk representations ------------------------ */
+
+/*
+  (The following includes lightly edited explanations by Colin Plumb.)
+
+  The malloc_chunk declaration below is misleading (but accurate and
+  necessary).  It declares a "view" into memory allowing access to
+  necessary fields at known offsets from a given base.
+
+  Chunks of memory are maintained using a `boundary tag' method as
+  originally described by Knuth.  (See the paper by Paul Wilson
+  ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a survey of such
+  techniques.)  Sizes of free chunks are stored both in the front of
+  each chunk and at the end.  This makes consolidating fragmented
+  chunks into bigger chunks fast.  The head fields also hold bits
+  representing whether chunks are free or in use.
+
+  Here are some pictures to make it clearer.  They are "exploded" to
+  show that the state of a chunk can be thought of as extending from
+  the high 31 bits of the head field of its header through the
+  prev_foot and PINUSE_BIT bit of the following chunk header.
+
+  A chunk that's in use looks like:
+
+   chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	   | Size of previous chunk (if P = 0)                             |
+	   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P|
+	 | Size of this chunk                                         1| +-+
+   mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	 |                                                               |
+	 +-                                                             -+
+	 |                                                               |
+	 +-                                                             -+
+	 |                                                               :
+	 +-      size - sizeof(size_t) available payload bytes          -+
+	 :                                                               |
+ chunk-> +-                                                             -+
+	 |                                                               |
+	 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |1|
+       | Size of next chunk (may or may not be in use)               | +-+
+ mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+    And if it's free, it looks like this:
+
+   chunk-> +-                                                             -+
+	   | User payload (must be in use, or we would have merged!)       |
+	   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |P|
+	 | Size of this chunk                                         0| +-+
+   mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	 | Next pointer                                                  |
+	 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	 | Prev pointer                                                  |
+	 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	 |                                                               :
+	 +-      size - sizeof(struct chunk) unused bytes               -+
+	 :                                                               |
+ chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	 | Size of this chunk                                            |
+	 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |0|
+       | Size of next chunk (must be in use, or we would have merged)| +-+
+ mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+       |                                                               :
+       +- User payload                                                -+
+       :                                                               |
+       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+								     |0|
+								     +-+
+  Note that since we always merge adjacent free chunks, the chunks
+  adjacent to a free chunk must be in use.
+
+  Given a pointer to a chunk (which can be derived trivially from the
+  payload pointer) we can, in O(1) time, find out whether the adjacent
+  chunks are free, and if so, unlink them from the lists that they
+  are on and merge them with the current chunk.
+
+  Chunks always begin on even word boundaries, so the mem portion
+  (which is returned to the user) is also on an even word boundary, and
+  thus at least double-word aligned.
+
+  The P (PINUSE_BIT) bit, stored in the unused low-order bit of the
+  chunk size (which is always a multiple of two words), is an in-use
+  bit for the *previous* chunk.  If that bit is *clear*, then the
+  word before the current chunk size contains the previous chunk
+  size, and can be used to find the front of the previous chunk.
+  The very first chunk allocated always has this bit set, preventing
+  access to non-existent (or non-owned) memory. If pinuse is set for
+  any given chunk, then you CANNOT determine the size of the
+  previous chunk, and might even get a memory addressing fault when
+  trying to do so.
+
+  The C (CINUSE_BIT) bit, stored in the unused second-lowest bit of
+  the chunk size redundantly records whether the current chunk is
+  inuse. This redundancy enables usage checks within free and realloc,
+  and reduces indirection when freeing and consolidating chunks.
+
+  Each freshly allocated chunk must have both cinuse and pinuse set.
+  That is, each allocated chunk borders either a previously allocated
+  and still in-use chunk, or the base of its memory arena. This is
+  ensured by making all allocations from the `lowest' part of any
+  found chunk.  Further, no free chunk physically borders another one,
+  so each free chunk is known to be preceded and followed by either
+  inuse chunks or the ends of memory.
+
+  Note that the `foot' of the current chunk is actually represented
+  as the prev_foot of the NEXT chunk. This makes it easier to
+  deal with alignments etc but can be very confusing when trying
+  to extend or adapt this code.
+
+  The exceptions to all this are
+
+     1. The special chunk `top' is the top-most available chunk (i.e.,
+	the one bordering the end of available memory). It is treated
+	specially.  Top is never included in any bin, is used only if
+	no other chunk is available, and is released back to the
+	system if it is very large (see M_TRIM_THRESHOLD).  In effect,
+	the top chunk is treated as larger (and thus less well
+	fitting) than any other available chunk.  The top chunk
+	doesn't update its trailing size field since there is no next
+	contiguous chunk that would have to index off it. However,
+	space is still allocated for it (TOP_FOOT_SIZE) to enable
+	separation or merging when space is extended.
+
+     2. Chunks allocated via mmap, which have the lowest-order bit
+	(IS_MMAPPED_BIT) set in their prev_foot fields, and do not set
+	PINUSE_BIT in their head fields.  Because they are allocated
+	one-by-one, each must carry its own prev_foot field, which is
+	also used to hold the offset this chunk has within its mmapped
+	region, which is needed to preserve alignment. Each mmapped
+	chunk is trailed by the first two fields of a fake next-chunk
+	for sake of usage checks.
+
+*/
+
+struct malloc_chunk {  /* header overlaid on every chunk; user memory starts at fd */
+  size_t               prev_foot;  /* Size of previous chunk (if free).  */
+  size_t               head;       /* Size of this chunk or'ed with the inuse flag bits. */
+  struct malloc_chunk* fd;         /* Forward list link -- used only if free. */
+  struct malloc_chunk* bk;         /* Backward list link -- used only if free. */
+};
+
+typedef struct malloc_chunk  mchunk;
+typedef struct malloc_chunk* mchunkptr;
+typedef struct malloc_chunk* sbinptr;  /* The type of bins of chunks */
+typedef unsigned int bindex_t;         /* Index of a small bin or tree bin */
+typedef unsigned int binmap_t;         /* Bitmap with one bit per bin (smallmap, treemap) */
+typedef unsigned int flag_t;           /* The type of various bit flag sets */
+
+/* ------------------- Chunks sizes and alignments ----------------------- */
+
+#define MCHUNK_SIZE         (sizeof(mchunk))  /* full chunk header, list links included */
+/* Bookkeeping bytes that pad_request adds to every request */
+#if FOOTERS
+#define CHUNK_OVERHEAD      (TWO_SIZE_T_SIZES)
+#else /* FOOTERS */
+#define CHUNK_OVERHEAD      (SIZE_T_SIZE)
+#endif /* FOOTERS */
+
+/* MMapped chunks need a second word of overhead ... */
+#define MMAP_CHUNK_OVERHEAD (TWO_SIZE_T_SIZES)
+/* ... and additional padding for fake next-chunk at foot */
+#define MMAP_FOOT_PAD       (FOUR_SIZE_T_SIZES)
+
+/* The smallest size we can malloc is an aligned minimal chunk: MCHUNK_SIZE rounded up with CHUNK_ALIGN_MASK */
+#define MIN_CHUNK_SIZE\
+  ((MCHUNK_SIZE + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
+
+/* conversion from malloc headers to user pointers, and back (user memory starts TWO_SIZE_T_SIZES bytes past the chunk) */
+#define chunk2mem(p)        ((void*)((char*)(p)       + TWO_SIZE_T_SIZES))
+#define mem2chunk(mem)      ((mchunkptr)((char*)(mem) - TWO_SIZE_T_SIZES))
+/* chunk associated with aligned address A; fully parenthesized so the result is safe in any expression */
+#define align_as_chunk(A)   ((mchunkptr)((A) + align_offset(chunk2mem(A))))
+
+/* Bounds on request (not chunk) sizes; request2size maps anything below MIN_REQUEST to MIN_CHUNK_SIZE. */
+#define MAX_REQUEST         ((-MIN_CHUNK_SIZE) << 2)
+#define MIN_REQUEST         (MIN_CHUNK_SIZE - CHUNK_OVERHEAD - SIZE_T_ONE)
+
+/* pad request bytes into a usable size: add CHUNK_OVERHEAD, then round up with CHUNK_ALIGN_MASK */
+#define pad_request(req) \
+   (((req) + CHUNK_OVERHEAD + CHUNK_ALIGN_MASK) & ~CHUNK_ALIGN_MASK)
+
+/* pad request, checking for minimum (but not maximum); req is evaluated more than once */
+#define request2size(req) \
+  (((req) < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(req))
+
+
+/* ------------------ Operations on head and foot fields ----------------- */
+
+/*
+  The head field of a chunk is or'ed with PINUSE_BIT when previous
+  adjacent chunk in use, and or'ed with CINUSE_BIT if this chunk is in
+  use. If the chunk was obtained with mmap, the prev_foot field has
+  IS_MMAPPED_BIT set, with its other bits holding the offset from the
+  base of the mmapped region to the base of the chunk.
+
+  FLAG4_BIT is not used by this malloc, but might be useful in extensions.
+*/
+
+#define PINUSE_BIT          (SIZE_T_ONE)   /* set in head when the previous adjacent chunk is in use */
+#define CINUSE_BIT          (SIZE_T_TWO)   /* set in head when this chunk is in use */
+#define FLAG4_BIT           (SIZE_T_FOUR)  /* not used by this malloc */
+#define INUSE_BITS          (PINUSE_BIT|CINUSE_BIT)
+#define FLAG_BITS           (PINUSE_BIT|CINUSE_BIT|FLAG4_BIT)
+
+/* Head value for fenceposts */
+#define FENCEPOST_HEAD      (INUSE_BITS|SIZE_T_SIZE)
+
+/* extraction of fields from head words (cinuse/pinuse yield the masked bit, not 0/1) */
+#define cinuse(p)           ((p)->head & CINUSE_BIT)
+#define pinuse(p)           ((p)->head & PINUSE_BIT)
+#define chunksize(p)        ((p)->head & ~(FLAG_BITS))
+
+#define clear_pinuse(p)     ((p)->head &= ~PINUSE_BIT)
+#define clear_cinuse(p)     ((p)->head &= ~CINUSE_BIT)
+
+/* Treat space at ptr +/- offset as a chunk (s is a byte offset) */
+#define chunk_plus_offset(p, s)  ((mchunkptr)(((char*)(p)) + (s)))
+#define chunk_minus_offset(p, s) ((mchunkptr)(((char*)(p)) - (s)))
+
+/* Ptr to next or previous physical malloc_chunk; prev_chunk reads prev_foot, valid only while pinuse(p) is clear. */
+#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->head & ~FLAG_BITS)))
+#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_foot) ))
+
+/* extract next chunk's pinuse bit */
+#define next_pinuse(p)  ((next_chunk(p)->head) & PINUSE_BIT)
+
+/* Get/set size at footer, i.e. the prev_foot field of the chunk located s bytes past p */
+#define get_foot(p, s)  (((mchunkptr)((char*)(p) + (s)))->prev_foot)
+#define set_foot(p, s)  (((mchunkptr)((char*)(p) + (s)))->prev_foot = (s))
+
+/* Set size, pinuse bit, and foot; s is parenthesized so expression arguments expand as intended */
+#define set_size_and_pinuse_of_free_chunk(p, s)\
+  ((p)->head = ((s)|PINUSE_BIT), set_foot(p, s))
+
+/* Set size, pinuse bit, foot, and clear next pinuse (n is the chunk following p) */
+#define set_free_with_pinuse(p, s, n)\
+  (clear_pinuse(n), set_size_and_pinuse_of_free_chunk(p, s))
+/* True if p is an mmapped chunk: PINUSE_BIT clear in head and IS_MMAPPED_BIT set in prev_foot */
+#define is_mmapped(p)\
+  (!((p)->head & PINUSE_BIT) && ((p)->prev_foot & IS_MMAPPED_BIT))
+
+/* Get the internal overhead associated with chunk p */
+#define overhead_for(p)\
+ (is_mmapped(p)? MMAP_CHUNK_OVERHEAD : CHUNK_OVERHEAD)
+
+/* Return true if malloced space is not necessarily cleared (with MMAP_CLEARS only non-mmapped chunks need clearing) */
+#if MMAP_CLEARS
+#define calloc_must_clear(p) (!is_mmapped(p))
+#else /* MMAP_CLEARS */
+#define calloc_must_clear(p) (1)
+#endif /* MMAP_CLEARS */
+
+/* ---------------------- Overlaid data structures ----------------------- */
+
+/*
+  When chunks are not in use, they are treated as nodes of either
+  lists or trees.
+
+  "Small"  chunks are stored in circular doubly-linked lists, and look
+  like this:
+
+    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Size of previous chunk                            |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `head:' |             Size of chunk, in bytes                         |P|
+      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Forward pointer to next chunk in list             |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Back pointer to previous chunk in list            |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Unused space (may be 0 bytes long)                .
+	    .                                                               .
+	    .                                                               |
+nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `foot:' |             Size of chunk, in bytes                           |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+  Larger chunks are kept in a form of bitwise digital trees (aka
+  tries) keyed on chunksizes.  Because malloc_tree_chunks are only for
+  free chunks of at least 256 bytes, their size doesn't impose any
+  constraints on user chunk sizes.  Each node looks like:
+
+    chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Size of previous chunk                            |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `head:' |             Size of chunk, in bytes                         |P|
+      mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Forward pointer to next chunk of same size        |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Back pointer to previous chunk of same size       |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Pointer to left child (child[0])                  |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Pointer to right child (child[1])                 |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Pointer to parent                                 |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             bin index of this chunk                           |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+	    |             Unused space                                      .
+	    .                                                               |
+nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+    `foot:' |             Size of chunk, in bytes                           |
+	    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+  Each tree holding treenodes is a tree of unique chunk sizes.  Chunks
+  of the same size are arranged in a circularly-linked list, with only
+  the oldest chunk (the next to be used, in our FIFO ordering)
+  actually in the tree.  (Tree members are distinguished by a non-null
+  parent pointer.)  If a chunk with the same size as an existing node
+  is inserted, it is linked off the existing node using pointers that
+  work in the same way as fd/bk pointers of small chunks.
+
+  Each tree contains a power of 2 sized range of chunk sizes (the
+  smallest is 0x100 <= x < 0x180), which is divided in half at each
+  tree level, with the chunks in the smaller half of the range (0x100
+  <= x < 0x140 for the top node) in the left subtree and the larger
+  half (0x140 <= x < 0x180) in the right subtree.  This is, of course,
+  done by inspecting individual bits.
+
+  Using these rules, each node's left subtree contains all smaller
+  sizes than its right subtree.  However, the node at the root of each
+  subtree has no particular ordering relationship to either.  (The
+  dividing line between the subtree sizes is based on trie relation.)
+  If we remove the last chunk of a given size from the interior of the
+  tree, we need to replace it with a leaf node.  The tree ordering
+  rules permit a node to be replaced by any leaf below it.
+
+  The smallest chunk in a tree (a common operation in a best-fit
+  allocator) can be found by walking a path to the leftmost leaf in
+  the tree.  Unlike a usual binary tree, where we follow left child
+  pointers until we reach a null, here we follow the right child
+  pointer any time the left one is null, until we reach a leaf with
+  both child pointers null. The smallest chunk in the tree will be
+  somewhere along that path.
+
+  The worst case number of steps to add, find, or remove a node is
+  bounded by the number of bits differentiating chunks within
+  bins. Under current bin calculations, this ranges from 6 up to 21
+  (for 32 bit sizes) or up to 53 (for 64 bit sizes). The typical case
+  is of course much better.
+*/
+
+struct malloc_tree_chunk {
+  /* The first four fields must be compatible with malloc_chunk */
+  size_t                    prev_foot;
+  size_t                    head;
+  struct malloc_tree_chunk* fd;        /* next chunk of the same size */
+  struct malloc_tree_chunk* bk;        /* previous chunk of the same size */
+
+  struct malloc_tree_chunk* child[2];  /* [0] = left child, [1] = right child */
+  struct malloc_tree_chunk* parent;    /* non-null only for chunks that are actual tree members */
+  bindex_t                  index;     /* bin index of this chunk */
+};
+
+/* Tree-chunk aliases: the node type, a pointer to a node, and */
+/* the type of bins of trees (a pointer to a tree's root node). */
+typedef struct malloc_tree_chunk tchunk, *tchunkptr, *tbinptr;
+
+/* Left child if there is one, otherwise the right child (possibly null) */
+#define leftmost_child(t) ((t)->child[0]? (t)->child[0] : (t)->child[1])
+
+/* ----------------------------- Segments -------------------------------- */
+
+/*
+  Each malloc space may include non-contiguous segments, held in a
+  list headed by an embedded malloc_segment record representing the
+  top-most space. Segments also include flags holding properties of
+  the space. Large chunks that are directly allocated by mmap are not
+  included in this list. They are instead independently created and
+  destroyed without otherwise keeping track of them.
+
+  Segment management mainly comes into play for spaces allocated by
+  MMAP.  Any call to MMAP might or might not return memory that is
+  adjacent to an existing segment.  MORECORE normally contiguously
+  extends the current space, so this space is almost always adjacent,
+  which is simpler and faster to deal with. (This is why MORECORE is
+  used preferentially to MMAP when both are available -- see
+  sys_alloc.)  When allocating using MMAP, we don't use any of the
+  hinting mechanisms (inconsistently) supported in various
+  implementations of unix mmap, or distinguish reserving from
+  committing memory. Instead, we just ask for space, and exploit
+  contiguity when we get it.  It is probably possible to do
+  better than this on some systems, but no general scheme seems
+  to be significantly better.
+
+  Management entails a simpler variant of the consolidation scheme
+  used for chunks to reduce fragmentation -- new adjacent memory is
+  normally prepended or appended to an existing segment. However,
+  there are limitations compared to chunk consolidation that mostly
+  reflect the fact that segment processing is relatively infrequent
+  (occurring only when getting memory from system) and that we
+  don't expect to have huge numbers of segments:
+
+  * Segments are not indexed, so traversal requires linear scans.  (It
+    would be possible to index these, but is not worth the extra
+    overhead and complexity for most programs on most platforms.)
+  * New segments are only appended to old ones when holding top-most
+    memory; if they cannot be prepended to others, they are held in
+    different segments.
+
+  Except for the top-most segment of an mstate, each segment record
+  is kept at the tail of its segment. Segments are added by pushing
+  segment records onto the list headed by &mstate.seg for the
+  containing mstate.
+
+  Segment flags control allocation/merge/deallocation policies:
+  * If EXTERN_BIT set, then we did not allocate this segment,
+    and so should not try to deallocate or merge with others.
+    (This currently holds only for the initial segment passed
+    into create_mspace_with_base.)
+  * If IS_MMAPPED_BIT set, the segment may be merged with
+    other surrounding mmapped segments and trimmed/de-allocated
+    using munmap.
+  * If neither bit is set, then the segment was obtained using
+    MORECORE so can be merged with surrounding MORECORE'd segments
+    and deallocated/trimmed using MORECORE with negative arguments.
+*/
+
+struct malloc_segment {
+  char*        base;             /* base address; the segment spans [base, base + size) */
+  size_t       size;             /* allocated size in bytes */
+  struct malloc_segment* next;   /* ptr to next segment; 0 ends the list */
+  flag_t       sflags;           /* IS_MMAPPED_BIT and/or EXTERN_BIT */
+};
+
+#define is_mmapped_segment(S)  ((S)->sflags & IS_MMAPPED_BIT)  /* nonzero if flag set */
+#define is_extern_segment(S)   ((S)->sflags & EXTERN_BIT)      /* nonzero if flag set */
+
+typedef struct malloc_segment  msegment;
+typedef struct malloc_segment* msegmentptr;
+
+/* ---------------------------- malloc_state ----------------------------- */
+
+/*
+   A malloc_state holds all of the bookkeeping for a space.
+   The main fields are:
+
+  Top
+    The topmost chunk of the currently active segment. Its size is
+    cached in topsize.  The actual size of topmost space is
+    topsize+TOP_FOOT_SIZE, which includes space reserved for adding
+    fenceposts and segment records if necessary when getting more
+    space from the system.  The size at which to autotrim top is
+    cached from mparams in trim_check, except that it is disabled if
+    an autotrim fails.
+
+  Designated victim (dv)
+    This is the preferred chunk for servicing small requests that
+    don't have exact fits.  It is normally the chunk split off most
+    recently to service another small request.  Its size is cached in
+    dvsize. The link fields of this chunk are not maintained since it
+    is not kept in a bin.
+
+  SmallBins
+    An array of bin headers for free chunks.  These bins hold chunks
+    with sizes less than MIN_LARGE_SIZE bytes. Each bin contains
+    chunks of all the same size, spaced 8 bytes apart.  To simplify
+    use in double-linked lists, each bin header acts as a malloc_chunk
+    pointing to the real first node, if it exists (else pointing to
+    itself).  This avoids special-casing for headers.  But to avoid
+    waste, we allocate only the fd/bk pointers of bins, and then use
+    repositioning tricks to treat these as the fields of a chunk.
+
+  TreeBins
+    Treebins are pointers to the roots of trees holding a range of
+    sizes. There are 2 equally spaced treebins for each power of two
+    from TREEBIN_SHIFT to TREEBIN_SHIFT+16. The last bin holds anything
+    larger.
+
+  Bin maps
+    There is one bit map for small bins ("smallmap") and one for
+    treebins ("treemap").  Each bin sets its bit when non-empty, and
+    clears the bit when empty.  Bit operations are then used to avoid
+    bin-by-bin searching -- nearly all "search" is done without ever
+    looking at bins that won't be selected.  The bit maps
+    conservatively use 32 bits per map word, even if on 64bit system.
+    For a good description of some of the bit-based techniques used
+    here, see Henry S. Warren Jr's book "Hacker's Delight" (and
+    supplement at http://hackersdelight.org/). Many of these are
+    intended to reduce the branchiness of paths through malloc etc, as
+    well as to reduce the number of memory locations read or written.
+
+  Segments
+    A list of segments headed by an embedded malloc_segment record
+    representing the initial space.
+
+  Address check support
+    The least_addr field is the least address ever obtained from
+    MORECORE or MMAP. Attempted frees and reallocs of any address less
+    than this are trapped (unless INSECURE is defined).
+
+  Magic tag
+    A cross-check field that should always hold same value as mparams.magic.
+
+  Flags
+    Bits recording whether to use MMAP, locks, or contiguous MORECORE
+
+  Statistics
+    Each space keeps track of current and maximum system memory
+    obtained via MORECORE or MMAP.
+
+  Trim support
+    Fields holding the amount of unused topmost memory that should trigger
+    trimming, and a counter to force periodic scanning to release unused
+    non-topmost segments.
+
+  Locking
+    If USE_LOCKS is defined, the "mutex" lock is acquired and released
+    around every public call using this mspace.
+
+  Extension support
+    A void* pointer and a size_t field that can be used to help implement
+    extensions to this malloc.
+*/
+
+/* Bin types, widths and sizes */
+#define NSMALLBINS        (32U)  /* number of small bins */
+#define NTREEBINS         (32U)  /* number of tree bins */
+#define SMALLBIN_SHIFT    (3U)   /* chunk size >> SMALLBIN_SHIFT gives the small bin index */
+#define SMALLBIN_WIDTH    (SIZE_T_ONE << SMALLBIN_SHIFT)  /* size step between adjacent small bins */
+#define TREEBIN_SHIFT     (8U)   /* sizes are shifted by this before tree bins are indexed */
+#define MIN_LARGE_SIZE    (SIZE_T_ONE << TREEBIN_SHIFT)   /* smallest size that is not "small" */
+#define MAX_SMALL_SIZE    (MIN_LARGE_SIZE - SIZE_T_ONE)
+#define MAX_SMALL_REQUEST (MAX_SMALL_SIZE - CHUNK_ALIGN_MASK - CHUNK_OVERHEAD)
+
+struct malloc_state {
+  binmap_t   smallmap;        /* one bit per small bin, set while that bin is non-empty */
+  binmap_t   treemap;         /* one bit per tree bin, set while that bin is non-empty */
+  size_t     dvsize;          /* cached size of dv */
+  size_t     topsize;         /* cached size of top */
+  char*      least_addr;      /* least address ever obtained from MORECORE or MMAP */
+  mchunkptr  dv;              /* designated victim: preferred chunk for small requests */
+  mchunkptr  top;             /* topmost chunk of the currently active segment */
+  size_t     trim_check;      /* size at which to autotrim top, cached from mparams */
+  size_t     release_checks;  /* presumably the counter forcing periodic segment-release scans -- confirm */
+  size_t     magic;           /* should always hold the same value as mparams.magic */
+  mchunkptr  smallbins[(NSMALLBINS+1)*2];  /* only the fd/bk pair of each bin header; see smallbin_at */
+  tbinptr    treebins[NTREEBINS];          /* pointers to the roots of the size-keyed trees */
+  size_t     footprint;       /* current system memory obtained via MORECORE or MMAP */
+  size_t     max_footprint;   /* maximum system memory obtained via MORECORE or MMAP */
+  flag_t     mflags;          /* whether to use MMAP, locks, or contiguous MORECORE */
+#if USE_LOCKS
+  MLOCK_T    mutex;     /* locate lock among fields that rarely change */
+#endif /* USE_LOCKS */
+  msegment   seg;       /* embedded record heading the list of segments */
+  void*      extp;      /* Unused but available for extensions */
+  size_t     exts;      /* size_t companion of extp, also for extensions */
+};
+
+typedef struct malloc_state*    mstate;  /* handle used by every routine operating on a space */
+
+/* ------------- Global malloc_state and malloc_params ------------------- */
+
+/*
+  malloc_params holds global properties, including those that can be
+  dynamically set using mallopt. There is a single instance, mparams,
+  initialized in init_mparams. Note that the non-zeroness of "magic"
+  also serves as an initialization flag.
+*/
+
+struct malloc_params {
+  volatile size_t magic;   /* nonzero once initialized; tested by ensure_initialization */
+  size_t page_size;        /* unit used by page_align / is_page_aligned */
+  size_t granularity;      /* unit used by granularity_align / is_granularity_aligned */
+  size_t mmap_threshold;   /* NOTE(review): presumably the request size served directly by mmap -- confirm */
+  size_t trim_threshold;   /* NOTE(review): presumably the source of mstate trim_check -- confirm */
+  flag_t default_mflags;   /* NOTE(review): presumably the initial mflags of a new state -- confirm */
+};
+
+static struct malloc_params mparams;  /* the single instance of the global parameters */
+
+/* Ensure mparams initialized: init_mparams() is called only while mparams.magic is still zero */
+#define ensure_initialization() ((void)(mparams.magic != 0 || init_mparams()))
+
+#if !ONLY_MSPACES
+
+/* The global malloc_state used for all non-"mspace" calls */
+static struct malloc_state _gm_;
+#define gm                 (&_gm_)
+#define is_global(M)       ((M) == &_gm_)
+
+#endif /* !ONLY_MSPACES */
+
+#define is_initialized(M)  ((M)->top != 0)  /* a state counts as initialized once top is set */
+
+/* -------------------------- system alloc setup ------------------------- */
+
+/* Operations on mflags: use_X tests a flag bit (nonzero if set); the other forms update mflags in place */
+
+#define use_lock(M)           ((M)->mflags &   USE_LOCK_BIT)
+#define enable_lock(M)        ((M)->mflags |=  USE_LOCK_BIT)
+#define disable_lock(M)       ((M)->mflags &= ~USE_LOCK_BIT)
+
+#define use_mmap(M)           ((M)->mflags &   USE_MMAP_BIT)
+#define enable_mmap(M)        ((M)->mflags |=  USE_MMAP_BIT)
+#define disable_mmap(M)       ((M)->mflags &= ~USE_MMAP_BIT)
+
+#define use_noncontiguous(M)  ((M)->mflags &   USE_NONCONTIGUOUS_BIT)
+#define disable_contiguous(M) ((M)->mflags |=  USE_NONCONTIGUOUS_BIT)
+/* Set the lock bit when L is nonzero, clear it otherwise */
+#define set_lock(M,L)\
+ ((M)->mflags = (L)?\
+  ((M)->mflags | USE_LOCK_BIT) :\
+  ((M)->mflags & ~USE_LOCK_BIT))
+
+/* page-align a size (rounds up; the mask arithmetic assumes page_size is a power of two) */
+#define page_align(S)\
+ (((S) + (mparams.page_size - SIZE_T_ONE)) & ~(mparams.page_size - SIZE_T_ONE))
+
+/* granularity-align a size (same rounding, using mparams.granularity) */
+#define granularity_align(S)\
+  (((S) + (mparams.granularity - SIZE_T_ONE))\
+   & ~(mparams.granularity - SIZE_T_ONE))
+
+
+/* For mmap, use granularity alignment on windows, else page-align */
+#ifdef WIN32
+#define mmap_align(S) granularity_align(S)
+#else
+#define mmap_align(S) page_align(S)
+#endif
+
+/* For sys_alloc, enough padding to ensure can malloc request on success */
+#define SYS_ALLOC_PADDING (TOP_FOOT_SIZE + MALLOC_ALIGNMENT)
+/* Alignment tests; S is cast to size_t, so it may be an address or a size */
+#define is_page_aligned(S)\
+   (((size_t)(S) & (mparams.page_size - SIZE_T_ONE)) == 0)
+#define is_granularity_aligned(S)\
+   (((size_t)(S) & (mparams.granularity - SIZE_T_ONE)) == 0)
+
+/*  True if segment S holds address A, i.e. S->base <= A < S->base + S->size (S parenthesized for safe expansion)  */
+#define segment_holds(S, A)\
+  ((char*)(A) >= (S)->base && (char*)(A) < (S)->base + (S)->size)
+
+/* Find the segment of m whose [base, base+size) range contains addr; 0 if none */
+static msegmentptr segment_holding(mstate m, char* addr) {
+  msegmentptr seg;
+  for (seg = &m->seg; seg != 0; seg = seg->next) {
+    char* lo = seg->base;
+    if (lo <= addr && addr < lo + seg->size)
+      return seg;
+  }
+  return 0;
+}
+
+/* Return 1 if any segment record in m's list is stored inside segment ss, else 0 */
+static int has_segment_link(mstate m, msegmentptr ss) {
+  msegmentptr rec;
+  for (rec = &m->seg; rec != 0; rec = rec->next) {
+    char* where = (char*)rec;
+    if (where >= ss->base && where < ss->base + ss->size)
+      return 1;
+  }
+  return 0;
+}
+
+#ifndef MORECORE_CANNOT_TRIM  /* trim when size s exceeds the state's cached trim_check */
+#define should_trim(M,s)  ((s) > (M)->trim_check)
+#else  /* MORECORE_CANNOT_TRIM */
+#define should_trim(M,s)  (0)
+#endif /* MORECORE_CANNOT_TRIM */
+
+/*
+  TOP_FOOT_SIZE is padding at the end of a segment: an alignment offset,
+  one padded malloc_segment record and one minimal chunk, i.e. the space
+  needed for segment records and fenceposts when noncontiguous segments are added.
+*/
+#define TOP_FOOT_SIZE\
+  (align_offset(chunk2mem(0))+pad_request(sizeof(struct malloc_segment))+MIN_CHUNK_SIZE)
+
+
+/* -------------------------------  Hooks -------------------------------- */
+
+/*
+  PREACTION should be defined to return 0 on success, and nonzero on
+  failure. If you are not using locking, you can redefine these to do
+  anything you like.
+*/
+
+#if USE_LOCKS
+/* Acquire/release the state's mutex, but only when its USE_LOCK_BIT is set */
+#define PREACTION(M)  ((use_lock(M))? ACQUIRE_LOCK(&(M)->mutex) : 0)
+#define POSTACTION(M) { if (use_lock(M)) RELEASE_LOCK(&(M)->mutex); }
+#else /* USE_LOCKS */
+/* No locking: fall back to no-op hooks unless the hooks were already defined */
+#ifndef PREACTION
+#define PREACTION(M) (0)
+#endif  /* PREACTION */
+
+#ifndef POSTACTION
+#define POSTACTION(M)
+#endif  /* POSTACTION */
+
+#endif /* USE_LOCKS */
+
+/*
+  CORRUPTION_ERROR_ACTION is triggered upon detected bad addresses.
+  USAGE_ERROR_ACTION is triggered on detected bad frees and
+  reallocs. The argument p is an address that might have triggered the
+  fault. It is ignored by the two predefined actions, but might be
+  useful in custom actions that try to help diagnose errors.
+*/
+
+#if PROCEED_ON_ERROR
+
+/* A count of the number of corruption errors causing resets */
+int malloc_corruption_error_count;
+
+/* default corruption action (forward declaration; the definition is not in this section) */
+static void reset_on_error(mstate m);
+
+#define CORRUPTION_ERROR_ACTION(m)  reset_on_error(m)
+#define USAGE_ERROR_ACTION(m, p)  /* expands to nothing: usage errors are ignored in this mode */
+
+#else /* PROCEED_ON_ERROR */
+
+#ifndef CORRUPTION_ERROR_ACTION
+#define CORRUPTION_ERROR_ACTION(m) ABORT
+#endif /* CORRUPTION_ERROR_ACTION */
+
+#ifndef USAGE_ERROR_ACTION
+#define USAGE_ERROR_ACTION(m,p) ABORT
+#endif /* USAGE_ERROR_ACTION */
+
+#endif /* PROCEED_ON_ERROR */
+
+/* -------------------------- Debugging setup ---------------------------- */
+
+#if ! DEBUG
+/* Checks are compiled out: each check_* macro expands to nothing */
+#define check_free_chunk(M,P)
+#define check_inuse_chunk(M,P)
+#define check_malloced_chunk(M,P,N)
+#define check_mmapped_chunk(M,P)
+#define check_malloc_state(M)
+#define check_top_chunk(M,P)
+
+#else /* DEBUG: each check_* macro forwards to the matching do_check_* routine */
+#define check_free_chunk(M,P)       do_check_free_chunk(M,P)
+#define check_inuse_chunk(M,P)      do_check_inuse_chunk(M,P)
+#define check_top_chunk(M,P)        do_check_top_chunk(M,P)
+#define check_malloced_chunk(M,P,N) do_check_malloced_chunk(M,P,N)
+#define check_mmapped_chunk(M,P)    do_check_mmapped_chunk(M,P)
+#define check_malloc_state(M)       do_check_malloc_state(M)
+/* Forward declarations of the checking routines used when DEBUG is on */
+static void   do_check_any_chunk(mstate m, mchunkptr p);
+static void   do_check_top_chunk(mstate m, mchunkptr p);
+static void   do_check_mmapped_chunk(mstate m, mchunkptr p);
+static void   do_check_inuse_chunk(mstate m, mchunkptr p);
+static void   do_check_free_chunk(mstate m, mchunkptr p);
+static void   do_check_malloced_chunk(mstate m, void* mem, size_t s);
+static void   do_check_tree(mstate m, tchunkptr t);
+static void   do_check_treebin(mstate m, bindex_t i);
+static void   do_check_smallbin(mstate m, bindex_t i);
+static void   do_check_malloc_state(mstate m);
+static int    bin_find(mstate m, mchunkptr x);
+static size_t traverse_and_check(mstate m);
+#endif /* DEBUG */
+
+/* ---------------------------- Indexing Bins ---------------------------- */
+
+#define is_small(s)         (((s) >> SMALLBIN_SHIFT) < NSMALLBINS)  /* size maps to a small bin */
+#define small_index(s)      ((s)  >> SMALLBIN_SHIFT)                /* size -> small bin index */
+#define small_index2size(i) ((i)  << SMALLBIN_SHIFT)                /* small bin index -> size */
+#define MIN_SMALL_INDEX     (small_index(MIN_CHUNK_SIZE))
+
+/* addressing by index: bin i's header is overlaid starting at smallbins[2*i]. See above about smallbin repositioning */
+#define smallbin_at(M, i)   ((sbinptr)((char*)&((M)->smallbins[(i)<<1])))
+#define treebin_at(M,i)     (&((M)->treebins[i]))
+
+/* assign tree index for size S to variable I (0 when S >> TREEBIN_SHIFT is 0, NTREEBINS-1 when it exceeds 0xFFFF). Use x86 asm if possible  */
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define compute_tree_index(S, I)\
+{\
+  unsigned int X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int K; /* receives the index of the highest set bit of X */\
+    __asm__("bsrl\t%1, %0\n\t" : "=r" (K) : "rm"  (X));\
+    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
+  }\
+}
+/* NOTE(review): X above is unsigned int; if S is a 64-bit size_t its upper bits are dropped before the range checks -- confirm such sizes cannot occur */
+#elif defined (__INTEL_COMPILER)
+#define compute_tree_index(S, I)\
+{\
+  size_t X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int K = _bit_scan_reverse (X); \
+    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
+  }\
+}
+
+#elif defined(_MSC_VER) && _MSC_VER>=1300
+#define compute_tree_index(S, I)\
+{\
+  size_t X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int K;\
+    _BitScanReverse((DWORD *) &K, X);\
+    I =  (bindex_t)((K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1)));\
+  }\
+}
+
+#else /* GNUC: portable fallback; presumably derives the same K with shifts and masks -- confirm */
+#define compute_tree_index(S, I)\
+{\
+  size_t X = S >> TREEBIN_SHIFT;\
+  if (X == 0)\
+    I = 0;\
+  else if (X > 0xFFFF)\
+    I = NTREEBINS-1;\
+  else {\
+    unsigned int Y = (unsigned int)X;\
+    unsigned int N = ((Y - 0x100) >> 16) & 8;\
+    unsigned int K = (((Y <<= N) - 0x1000) >> 16) & 4;\
+    N += K;\
+    N += K = (((Y <<= K) - 0x4000) >> 16) & 2;\
+    K = 14 - N + ((Y <<= K) >> 15);\
+    I = (K << 1) + ((S >> (K + (TREEBIN_SHIFT-1)) & 1));\
+  }\
+}
+#endif /* GNUC */
+
+/* Bit representing maximum resolved size in a treebin at i (SIZE_T_BITSIZE-1 for the last bin) */
+#define bit_for_tree_index(i) \
+   (((i) == NTREEBINS-1)? (SIZE_T_BITSIZE-1) : (((i) >> 1) + TREEBIN_SHIFT - 2))
+
+/* Shift placing maximum resolved bit in a treebin at i as sign bit (0 for the last bin) */
+#define leftshift_for_tree_index(i) \
+   (((i) == NTREEBINS-1)? 0 : \
+    ((SIZE_T_BITSIZE-SIZE_T_ONE) - (((i) >> 1) + TREEBIN_SHIFT - 2)))
+
+/* The size of the smallest chunk held in bin with index i; odd i adds the next-lower bit */
+#define minsize_for_tree_index(i) \
+   ((SIZE_T_ONE << (((i) >> 1) + TREEBIN_SHIFT)) |  \
+   (((size_t)((i) & SIZE_T_ONE)) << (((i) >> 1) + TREEBIN_SHIFT - 1)))
+
+
+/* ------------------------ Operations on bin maps ----------------------- */
+
+/* bit corresponding to given index: a binmap_t with only bit i set */
+#define idx2bit(i)              ((binmap_t)(1) << (i))
+
+/* Mark/Clear/test the smallmap bit with given index */
+#define mark_smallmap(M,i)      ((M)->smallmap |=  idx2bit(i))
+#define clear_smallmap(M,i)     ((M)->smallmap &= ~idx2bit(i))
+#define smallmap_is_marked(M,i) ((M)->smallmap &   idx2bit(i)) /* nonzero (not necessarily 1) when set */
+
+#define mark_treemap(M,i)       ((M)->treemap  |=  idx2bit(i)) /* same three operations on the treemap */
+#define clear_treemap(M,i)      ((M)->treemap  &= ~idx2bit(i))
+#define treemap_is_marked(M,i)  ((M)->treemap  &   idx2bit(i)) /* nonzero (not necessarily 1) when set */
+
+/* isolate the least set bit of a bitmap (x is expected to be an unsigned map value) */
+#define least_bit(x)         ((x) & -(x))
+
+/* mask with all bits to left of least bit of x on; x is parenthesized so expressions are safe */
+#define left_bits(x)         (((x)<<1) | -((x)<<1))
+
+/* mask with all bits to left of or equal to least bit of x on */
+#define same_or_left_bits(x) ((x) | -(x))
+
+/* index corresponding to given bit: sets lvalue I to the index of the lowest set bit of X */
+
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int J;\
+  __asm__("bsfl\t%1, %0\n\t" : "=r" (J) : "rm" (X));\
+  I = (bindex_t)J;\
+}
+
+#elif defined (__INTEL_COMPILER)
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int J;\
+  J = _bit_scan_forward (X); \
+  I = (bindex_t)J;\
+}
+
+#elif defined(_MSC_VER) && _MSC_VER>=1300
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int J;\
+  _BitScanForward((DWORD *) &J, X);\
+  I = (bindex_t)J;\
+}
+
+#elif USE_BUILTIN_FFS
+#define compute_bit2idx(X, I) I = (bindex_t)(ffs(X)-1)
+
+#else
+#define compute_bit2idx(X, I)\
+{\
+  unsigned int Y = (X) - 1; /* X parenthesized so an expression argument is safe */\
+  unsigned int K = Y >> (16-4) & 16;\
+  unsigned int N = K;        Y >>= K;\
+  N += K = Y >> (8-3) &  8;  Y >>= K;\
+  N += K = Y >> (4-2) &  4;  Y >>= K;\
+  N += K = Y >> (2-1) &  2;  Y >>= K;\
+  N += K = Y >> (1-0) &  1;  Y >>= K;\
+  I = (bindex_t)(N + Y);\
+}
+#endif /* GNUC */
+
+
+/* ----------------------- Runtime Check Support ------------------------- */
+
+/*
+  For security, the main invariant is that malloc/free/etc never
+  writes to a static address other than malloc_state, unless static
+  malloc_state itself has been corrupted, which cannot occur via
+  malloc (because of these checks). In essence this means that we
+  believe all pointers, sizes, maps etc held in malloc_state, but
+  check all of those linked or offsetted from other embedded data
+  structures.  These checks are interspersed with main code in a way
+  that tends to minimize their run-time cost.
+
+  When FOOTERS is defined, in addition to range checking, we also
+  verify footer fields of inuse chunks, which can be used to guarantee
+  that the mstate controlling malloc/free is intact.  This is a
+  streamlined version of the approach described by William Robertson
+  et al in "Run-time Detection of Heap-based Overflows" LISA'03
+  http://www.usenix.org/events/lisa03/tech/robertson.html The footer
+  of an inuse chunk holds the xor of its mstate and a random seed,
+  that is checked upon calls to free() and realloc().  This is
+  (probabilistically) unguessable from outside the program, but can be
+  computed by any code successfully malloc'ing any chunk, so does not
+  itself provide protection against code that has already broken
+  security through some other means.  Unlike Robertson et al, we
+  always dynamically check addresses of all offset chunks (previous,
+  next, etc). This turns out to be cheaper than relying on hashes.
+*/
+
+#if !INSECURE
+/* Check if address a is at least as high as any from MORECORE or MMAP */
+#define ok_address(M, a) ((char*)(a) >= (M)->least_addr)
+/* Check if address of next chunk n is higher than base chunk p */
+#define ok_next(p, n)    ((char*)(p) < (char*)(n))
+/* Check if p has its cinuse bit on */
+#define ok_cinuse(p)     cinuse(p)
+/* Check if p has its pinuse bit on */
+#define ok_pinuse(p)     pinuse(p)
+
+#else /* !INSECURE */
+#define ok_address(M, a) (1) /* INSECURE build: every check is the constant 1 */
+#define ok_next(b, n)    (1)
+#define ok_cinuse(p)     (1)
+#define ok_pinuse(p)     (1)
+#endif /* !INSECURE */
+
+#if (FOOTERS && !INSECURE)
+/* Check if (alleged) mstate m has expected magic field, i.e. equals mparams.magic */
+#define ok_magic(M)      ((M)->magic == mparams.magic)
+#else  /* (FOOTERS && !INSECURE) */
+#define ok_magic(M)      (1)
+#endif /* (FOOTERS && !INSECURE) */
+
+
+/* In gcc, use __builtin_expect to minimize impact of checks: the check is hinted as expected to hold */
+#if !INSECURE
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define RTCHECK(e)  __builtin_expect(e, 1)
+#else /* GNUC */
+#define RTCHECK(e)  (e)
+#endif /* GNUC */
+#else /* !INSECURE */
+#define RTCHECK(e)  (1) /* INSECURE build: e is not evaluated at all */
+#endif /* !INSECURE */
+
+/* macros to set up inuse chunks with or without footers; s is the chunk size in bytes */
+
+#if !FOOTERS
+
+#define mark_inuse_foot(M,p,s)
+
+/* Set cinuse bit and pinuse bit of next chunk */
+#define set_inuse(M,p,s)\
+  ((p)->head = (((p)->head & PINUSE_BIT)|(s)|CINUSE_BIT),\
+  ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT)
+
+/* Set cinuse and pinuse of this chunk and pinuse of next chunk */
+#define set_inuse_and_pinuse(M,p,s)\
+  ((p)->head = ((s)|PINUSE_BIT|CINUSE_BIT),\
+  ((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT)
+
+/* Set size, cinuse and pinuse bit of this chunk */
+#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
+  ((p)->head = ((s)|PINUSE_BIT|CINUSE_BIT))
+
+#else /* FOOTERS */
+
+/* Set foot of inuse chunk to be xor of mstate and seed */
+#define mark_inuse_foot(M,p,s)\
+  (((mchunkptr)((char*)(p) + (s)))->prev_foot = ((size_t)(M) ^ mparams.magic))
+
+#define get_mstate_for(p)\
+  ((mstate)(((mchunkptr)((char*)(p) +\
+    (chunksize(p))))->prev_foot ^ mparams.magic))
+
+#define set_inuse(M,p,s)\
+  ((p)->head = (((p)->head & PINUSE_BIT)|(s)|CINUSE_BIT),\
+  (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT), \
+  mark_inuse_foot(M,p,s))
+
+#define set_inuse_and_pinuse(M,p,s)\
+  ((p)->head = ((s)|PINUSE_BIT|CINUSE_BIT),\
+  (((mchunkptr)(((char*)(p)) + (s)))->head |= PINUSE_BIT),\
+ mark_inuse_foot(M,p,s))
+
+#define set_size_and_pinuse_of_inuse_chunk(M, p, s)\
+  ((p)->head = ((s)|PINUSE_BIT|CINUSE_BIT),\
+  mark_inuse_foot(M, p, s))
+
+#endif /* !FOOTERS */
+
+/* ---------------------------- setting mparams -------------------------- */
+
+/* Initialize mparams once (only while mparams.magic == 0, under the global lock); always returns 1 */
+static int init_mparams(void) {
+#ifdef NEED_GLOBAL_LOCK_INIT
+  if (malloc_global_mutex_status <= 0)
+    init_malloc_global_mutex();
+#endif
+
+  ACQUIRE_MALLOC_GLOBAL_LOCK();
+  if (mparams.magic == 0) {  /* a nonzero magic marks mparams as already set up */
+    size_t magic;
+    size_t psize;
+    size_t gsize;
+
+#ifndef WIN32
+    psize = malloc_getpagesize;
+    gsize = ((DEFAULT_GRANULARITY != 0)? DEFAULT_GRANULARITY : psize);
+#else /* WIN32 */
+    {
+      SYSTEM_INFO system_info;
+      GetSystemInfo(&system_info);
+      psize = system_info.dwPageSize;
+      gsize = ((DEFAULT_GRANULARITY != 0)?
+               DEFAULT_GRANULARITY : system_info.dwAllocationGranularity);
+    }
+#endif /* WIN32 */
+
+    /* Sanity-check configuration:
+       size_t must be unsigned and as wide as pointer type.
+       ints must be at least 4 bytes.
+       alignment must be at least 8.
+       Alignment, min chunk size, and page size must all be powers of 2.
+    */
+    if ((sizeof(size_t) != sizeof(char*)) ||
+        (MAX_SIZE_T < MIN_CHUNK_SIZE)  ||
+        (sizeof(int) < 4)  ||
+        (MALLOC_ALIGNMENT < (size_t)8U) ||
+        ((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-SIZE_T_ONE)) != 0) ||
+        ((MCHUNK_SIZE      & (MCHUNK_SIZE-SIZE_T_ONE))      != 0) ||
+        ((gsize            & (gsize-SIZE_T_ONE))            != 0) ||
+        ((psize            & (psize-SIZE_T_ONE))            != 0))
+      ABORT;
+
+    mparams.granularity = gsize;
+    mparams.page_size = psize;
+    mparams.mmap_threshold = DEFAULT_MMAP_THRESHOLD;
+    mparams.trim_threshold = DEFAULT_TRIM_THRESHOLD;
+#if MORECORE_CONTIGUOUS
+    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT;
+#else  /* MORECORE_CONTIGUOUS */
+    mparams.default_mflags = USE_LOCK_BIT|USE_MMAP_BIT|USE_NONCONTIGUOUS_BIT;
+#endif /* MORECORE_CONTIGUOUS */
+
+#if !ONLY_MSPACES
+    /* Set up lock for main malloc area */
+    gm->mflags = mparams.default_mflags;
+    (void)INITIAL_LOCK(&gm->mutex);
+#endif
+
+#if (FOOTERS && !INSECURE)
+    {
+#if USE_DEV_RANDOM
+      /* Try to use /dev/urandom, else fall back on using time */
+      int fd = open("/dev/urandom", O_RDONLY);
+      unsigned char buf[sizeof(size_t)];
+      int got = (fd >= 0 && read(fd, buf, sizeof(buf)) == sizeof(buf));
+      if (fd >= 0)
+        close(fd);  /* also after a short or failed read, so fd never leaks */
+      if (got)
+        magic = *((size_t *) buf);
+      else
+#endif /* USE_DEV_RANDOM */
+#ifdef WIN32
+        magic = (size_t)(GetTickCount() ^ (size_t)0x55555555U);
+#else
+      magic = (size_t)(time(0) ^ (size_t)0x55555555U);
+#endif
+      magic |= (size_t)8U;    /* ensure nonzero */
+      magic &= ~(size_t)7U;   /* improve chances of fault for bad values */
+    }
+#else /* (FOOTERS && !INSECURE) */
+    magic = (size_t)0x58585858U;
+#endif /* (FOOTERS && !INSECURE) */
+
+    mparams.magic = magic;  /* stored last; the guard above tests this field */
+  }
+
+  RELEASE_MALLOC_GLOBAL_LOCK();
+  return 1;
+}
+
+/* support for mallopt: set one tunable in mparams; returns 1 if accepted, 0 otherwise */
+static int change_mparam(int param_number, int value) {
+  size_t val = (size_t)value;
+  if (value == -1)   /* -1 selects the largest size_t value */
+    val = MAX_SIZE_T;
+  ensure_initialization();
+  if (param_number == M_TRIM_THRESHOLD) {
+    mparams.trim_threshold = val;
+    return 1;
+  }
+  if (param_number == M_MMAP_THRESHOLD) {
+    mparams.mmap_threshold = val;
+    return 1;
+  }
+  if (param_number == M_GRANULARITY) { /* needs a power of two of at least one page */
+    if (val < mparams.page_size || (val & (val-1)) != 0)
+      return 0;
+    mparams.granularity = val;
+    return 1;
+  }
+  return 0;  /* unknown parameter number */
+}
+
+#if DEBUG
+/* ------------------------- Debugging Support --------------------------- */
+
+/* Check properties of any chunk, whether free, inuse, mmapped etc  */
+static void do_check_any_chunk(mstate m, mchunkptr p) {
+  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD)); /* payload aligned, unless p is a fencepost */
+  assert(ok_address(m, p)); /* p must pass the ok_address check for m */
+}
+
+/* Check properties of top chunk p of m (debug builds only) */
+static void do_check_top_chunk(mstate m, mchunkptr p) {
+  msegmentptr sp = segment_holding(m, (char*)p); /* presumably the segment containing p; must be non-null */
+  size_t  sz = p->head & ~INUSE_BITS; /* third-lowest bit can be set! */
+  assert(sp != 0);
+  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
+  assert(ok_address(m, p));
+  assert(sz == m->topsize); /* header size agrees with the recorded top size */
+  assert(sz > 0);
+  assert(sz == ((sp->base + sp->size) - (char*)p) - TOP_FOOT_SIZE); /* top runs to the segment end, less TOP_FOOT_SIZE */
+  assert(pinuse(p));
+  assert(!pinuse(chunk_plus_offset(p, sz))); /* the chunk after top has pinuse clear */
+}
+
+/* Check properties of (inuse) mmapped chunks */
+static void do_check_mmapped_chunk(mstate m, mchunkptr p) {
+  size_t  sz = chunksize(p);
+  size_t len = (sz + (p->prev_foot & ~IS_MMAPPED_BIT) + MMAP_FOOT_PAD); /* chunk size + stored offset + foot pad */
+  assert(is_mmapped(p));
+  assert(use_mmap(m));
+  assert((is_aligned(chunk2mem(p))) || (p->head == FENCEPOST_HEAD));
+  assert(ok_address(m, p));
+  assert(!is_small(sz));
+  assert((len & (mparams.page_size-SIZE_T_ONE)) == 0); /* len has no bits below the page size */
+  assert(chunk_plus_offset(p, sz)->head == FENCEPOST_HEAD); /* fencepost head right after the chunk */
+  assert(chunk_plus_offset(p, sz+SIZE_T_SIZE)->head == 0); /* then a zero head */
+}
+
+/* Check properties of inuse chunks; mmapped ones get the extra mmapped-chunk checks */
+static void do_check_inuse_chunk(mstate m, mchunkptr p) {
+  do_check_any_chunk(m, p);
+  assert(cinuse(p));
+  assert(next_pinuse(p)); /* the following chunk must record p as in use */
+  /* If not pinuse and not mmapped, previous chunk has OK offset */
+  assert(is_mmapped(p) || pinuse(p) || next_chunk(prev_chunk(p)) == p);
+  if (is_mmapped(p))
+    do_check_mmapped_chunk(m, p);
+}
+
+/* Check properties of free chunks; link checks are skipped for the dv and top chunks */
+static void do_check_free_chunk(mstate m, mchunkptr p) {
+  size_t sz = chunksize(p);
+  mchunkptr next = chunk_plus_offset(p, sz); /* chunk that starts sz bytes after p */
+  do_check_any_chunk(m, p);
+  assert(!cinuse(p));
+  assert(!next_pinuse(p));
+  assert (!is_mmapped(p));
+  if (p != m->dv && p != m->top) {
+    if (sz >= MIN_CHUNK_SIZE) {
+      assert((sz & CHUNK_ALIGN_MASK) == 0);
+      assert(is_aligned(chunk2mem(p)));
+      assert(next->prev_foot == sz); /* size is mirrored in the next chunk's prev_foot */
+      assert(pinuse(p));
+      assert (next == m->top || cinuse(next)); /* next is top or in use */
+      assert(p->fd->bk == p); /* fd/bk neighbours point back at p */
+      assert(p->bk->fd == p);
+    }
+    else  /* markers are always of size SIZE_T_SIZE */
+      assert(sz == SIZE_T_SIZE);
+  }
+}
+
+/* Check properties of malloced chunks at the point they are malloced; s is the requested chunk size */
+static void do_check_malloced_chunk(mstate m, void* mem, size_t s) {
+  if (mem != 0) { /* a null result has nothing to check */
+    mchunkptr p = mem2chunk(mem);
+    size_t sz = p->head & ~(PINUSE_BIT|CINUSE_BIT); /* head with the two inuse bits masked off */
+    do_check_inuse_chunk(m, p);
+    assert((sz & CHUNK_ALIGN_MASK) == 0);
+    assert(sz >= MIN_CHUNK_SIZE);
+    assert(sz >= s);
+    /* unless mmapped, size is less than MIN_CHUNK_SIZE more than request */
+    assert(is_mmapped(p) || sz < (s + MIN_CHUNK_SIZE));
+  }
+}
+
+/* Check a tree and its subtrees (recursive); t is the node where checking starts */
+static void do_check_tree(mstate m, tchunkptr t) {
+  tchunkptr head = 0; /* the one node on t's same-size chain that has a parent */
+  tchunkptr u = t;
+  bindex_t tindex = t->index;
+  size_t tsize = chunksize(t);
+  bindex_t idx;
+  compute_tree_index(tsize, idx);
+  assert(tindex == idx); /* stored index matches the index computed from the size */
+  assert(tsize >= MIN_LARGE_SIZE);
+  assert(tsize >= minsize_for_tree_index(idx));
+  assert((idx == NTREEBINS-1) || (tsize < minsize_for_tree_index((idx+1))));
+
+  do { /* traverse through chain of same-sized nodes */
+    do_check_any_chunk(m, ((mchunkptr)u));
+    assert(u->index == tindex);
+    assert(chunksize(u) == tsize);
+    assert(!cinuse(u));
+    assert(!next_pinuse(u));
+    assert(u->fd->bk == u);
+    assert(u->bk->fd == u);
+    if (u->parent == 0) { /* chained node without a parent: must have no children */
+      assert(u->child[0] == 0);
+      assert(u->child[1] == 0);
+    }
+    else {
+      assert(head == 0); /* only one node on chain has parent */
+      head = u;
+      assert(u->parent != u);
+      assert (u->parent->child[0] == u ||
+	      u->parent->child[1] == u ||
+	      *((tbinptr*)(u->parent)) == u);
+      if (u->child[0] != 0) {
+	assert(u->child[0]->parent == u);
+	assert(u->child[0] != u);
+	do_check_tree(m, u->child[0]);
+      }
+      if (u->child[1] != 0) {
+	assert(u->child[1]->parent == u);
+	assert(u->child[1] != u);
+	do_check_tree(m, u->child[1]);
+      }
+      if (u->child[0] != 0 && u->child[1] != 0) {
+	assert(chunksize(u->child[0]) < chunksize(u->child[1]));
+      }
+    }
+    u = u->fd;
+  } while (u != t);
+  assert(head != 0); /* the chain must contain a node that has a parent */
+}
+
+/*  Verify treebin i of m: a null root needs a clear treemap bit; a set bit gets a tree check.  */
+static void do_check_treebin(mstate m, bindex_t i) {
+  tchunkptr root = *treebin_at(m, i);
+  int empty = !(m->treemap & (1U << i));
+  if (root == 0) {
+    assert(empty);
+  }
+  if (empty) return;
+  do_check_tree(m, root);
+}
+
+/*  Check all the chunks in a smallbin (bin i of m), walking the bk links.  */
+static void do_check_smallbin(mstate m, bindex_t i) {
+  sbinptr b = smallbin_at(m, i);
+  mchunkptr p = b->bk;
+  unsigned int empty = (m->smallmap & (1U << i)) == 0; /* 1 when bit i of smallmap is clear */
+  if (p == b) /* a bin linked only to itself must be marked empty */
+    assert(empty);
+  if (!empty) {
+    for (; p != b; p = p->bk) {
+      size_t size = chunksize(p);
+      mchunkptr q;
+      /* each chunk claims to be free */
+      do_check_free_chunk(m, p);
+      /* chunk belongs in bin */
+      assert(small_index(size) == i);
+      assert(p->bk == b || chunksize(p->bk) == chunksize(p)); /* bk neighbour is the bin or same-sized */
+      /* chunk is followed by an inuse chunk */
+      q = next_chunk(p);
+      if (q->head != FENCEPOST_HEAD)
+	do_check_inuse_chunk(m, q);
+    }
+  }
+}
+
+/* Report (1 or 0) whether chunk x is linked into the bin of m that matches its size. */
+static int bin_find(mstate m, mchunkptr x) {
+  const size_t want = chunksize(x);
+  if (!is_small(want)) {  /* tree chunk: descend by size bits, then scan the chain */
+    bindex_t ti;
+    tchunkptr node, peer;
+    size_t bits;
+    compute_tree_index(want, ti);
+    if (!treemap_is_marked(m, ti))
+      return 0;
+    node = *treebin_at(m, ti);
+    bits = want << leftshift_for_tree_index(ti);
+    while (node != 0 && chunksize(node) != want) {
+      node = node->child[(bits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
+      bits <<= 1;
+    }
+    if (node == 0)
+      return 0;
+    peer = node;
+    do {
+      if (peer == (tchunkptr)x)
+        return 1;
+      peer = peer->fd;
+    } while (peer != node);
+  }
+  else {
+    bindex_t si = small_index(want);
+    sbinptr bin = smallbin_at(m, si);
+    mchunkptr cur = bin;
+    if (!smallmap_is_marked(m, si))
+      return 0;
+    do { if (cur == x) return 1; cur = cur->fd; } while (cur != bin);
+  }
+  return 0;
+}
+
+/* Walk the chunks of every segment of m, checking each one; return the byte total */
+static size_t traverse_and_check(mstate m) {
+  size_t total = 0;
+  msegmentptr seg;
+  if (!is_initialized(m))
+    return total;
+  /* the walk below stops at top, so top and its foot are counted here */
+  total += m->topsize + TOP_FOOT_SIZE;
+  for (seg = &m->seg; seg != 0; seg = seg->next) {
+    mchunkptr q = align_as_chunk(seg->base);
+    mchunkptr lastq = 0;
+    assert(pinuse(q));
+    /* stop at the segment end, at top, or at the first fencepost */
+    for (; segment_holds(seg, q) && q != m->top && q->head != FENCEPOST_HEAD;
+         lastq = q, q = next_chunk(q)) {
+      total += chunksize(q);
+      if (!cinuse(q)) {
+        /* free chunk: must be dv or sit in a bin */
+        assert(q == m->dv || bin_find(m, q));
+        assert(lastq == 0 || cinuse(lastq)); /* Not 2 consecutive free */
+        do_check_free_chunk(m, q);
+      }
+      else {
+        assert(!bin_find(m, q));
+        do_check_inuse_chunk(m, q);
+      }
+    }
+  }
+  return total;
+}
+
+/* Check all properties of malloc_state: every bin, then dv, top, and the footprint totals. */
+static void do_check_malloc_state(mstate m) {
+  bindex_t i;
+  size_t total;
+  /* check bins */
+  for (i = 0; i < NSMALLBINS; ++i)
+    do_check_smallbin(m, i);
+  for (i = 0; i < NTREEBINS; ++i)
+    do_check_treebin(m, i);
+
+  if (m->dvsize != 0) { /* check dv chunk */
+    do_check_any_chunk(m, m->dv);
+    assert(m->dvsize == chunksize(m->dv));
+    assert(m->dvsize >= MIN_CHUNK_SIZE);
+    assert(bin_find(m, m->dv) == 0); /* dv must not also be linked in a bin */
+  }
+
+  if (m->top != 0) {   /* check top chunk */
+    do_check_top_chunk(m, m->top);
+    /*assert(m->topsize == chunksize(m->top)); redundant */
+    assert(m->topsize > 0);
+    assert(bin_find(m, m->top) == 0); /* top must not also be linked in a bin */
+  }
+
+  total = traverse_and_check(m); /* bytes seen while walking all segments */
+  assert(total <= m->footprint);
+  assert(m->footprint <= m->max_footprint);
+}
+#endif /* DEBUG */
+
+/* ----------------------------- statistics ------------------------------ */
+
+#if !NO_MALLINFO
+/* Build a mallinfo snapshot for m by walking every chunk of every segment */
+static struct mallinfo internal_mallinfo(mstate m) {
+  struct mallinfo nm = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+  ensure_initialization();
+  if (PREACTION(m))
+    return nm;   /* PREACTION returned nonzero: report all zeros */
+  check_malloc_state(m);
+  if (is_initialized(m)) {
+    size_t free_chunks = SIZE_T_ONE; /* top always free */
+    size_t free_bytes = m->topsize + TOP_FOOT_SIZE;
+    size_t seen_bytes = free_bytes;
+    msegmentptr seg;
+    for (seg = &m->seg; seg != 0; seg = seg->next) {
+      mchunkptr q;
+      for (q = align_as_chunk(seg->base);
+           segment_holds(seg, q) && q != m->top && q->head != FENCEPOST_HEAD;
+           q = next_chunk(q)) {
+        size_t sz = chunksize(q);
+        seen_bytes += sz;
+        if (cinuse(q))
+          continue;   /* only free chunks add to the free totals */
+        free_bytes += sz;
+        ++free_chunks;
+      }
+    }
+
+    nm.arena    = seen_bytes;
+    nm.ordblks  = free_chunks;
+    nm.hblkhd   = m->footprint - seen_bytes;
+    nm.usmblks  = m->max_footprint;
+    nm.uordblks = m->footprint - free_bytes;
+    nm.fordblks = free_bytes;
+    nm.keepcost = m->topsize;
+  }
+
+  POSTACTION(m);
+  return nm;
+}
+#endif /* !NO_MALLINFO */
+
+/* Print max system bytes, system bytes and in-use bytes of m to stderr */
+static void internal_malloc_stats(mstate m) {
+  size_t peak = 0;
+  size_t current = 0;
+  size_t busy = 0;
+  ensure_initialization();
+  if (PREACTION(m))
+    return;
+  check_malloc_state(m);
+  if (is_initialized(m)) {
+    msegmentptr seg;
+    peak = m->max_footprint;
+    current = m->footprint;
+    busy = current - (m->topsize + TOP_FOOT_SIZE);
+    /* take off every free chunk met while walking the segments */
+    for (seg = &m->seg; seg != 0; seg = seg->next) {
+      mchunkptr q = align_as_chunk(seg->base);
+      while (segment_holds(seg, q) &&
+             q != m->top && q->head != FENCEPOST_HEAD) {
+        if (!cinuse(q))
+          busy -= chunksize(q);
+        q = next_chunk(q);
+      }
+    }
+  }
+
+  fprintf(stderr, "max system bytes = %10lu\n", (unsigned long)(peak));
+  fprintf(stderr, "system bytes     = %10lu\n", (unsigned long)(current));
+  fprintf(stderr, "in use bytes     = %10lu\n", (unsigned long)(busy));
+
+  POSTACTION(m);
+}
+
+/* ----------------------- Operations on smallbins ----------------------- */
+
+/*
+  Various forms of linking and unlinking are defined as macros.  Even
+  the ones for trees, which are very long but have very short typical
+  paths.  This is ugly but reduces reliance on inlining support of
+  compilers.
+*/
+
+/* Link a free chunk P of size S into the smallbin of M selected by small_index(S) */
+#define insert_small_chunk(M, P, S) {\
+  bindex_t I  = small_index(S);\
+  mchunkptr B = smallbin_at(M, I);\
+  mchunkptr F = B; /* stays B when the bin was unmarked (empty) */\
+  assert(S >= MIN_CHUNK_SIZE);\
+  if (!smallmap_is_marked(M, I))\
+    mark_smallmap(M, I);\
+  else if (RTCHECK(ok_address(M, B->fd)))\
+    F = B->fd; /* current first chunk of the bin */\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+  B->fd = P; /* splice P between B and F */\
+  F->bk = P;\
+  P->fd = F;\
+  P->bk = B;\
+}
+
+/* Unlink a chunk P of size S from its smallbin in M */
+#define unlink_small_chunk(M, P, S) {\
+  mchunkptr F = P->fd;\
+  mchunkptr B = P->bk;\
+  bindex_t I = small_index(S);\
+  assert(P != B);\
+  assert(P != F);\
+  assert(chunksize(P) == small_index2size(I));\
+  if (F == B) /* both neighbours are the same node: clear the bin's map bit */\
+    clear_smallmap(M, I);\
+  else if (RTCHECK((F == smallbin_at(M,I) || ok_address(M, F)) &&\
+		   (B == smallbin_at(M,I) || ok_address(M, B)))) {\
+    F->bk = B; /* bypass P in both directions */\
+    B->fd = F;\
+  }\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+}
+
+/* Unlink the first chunk P from smallbin B (index I) of M */
+#define unlink_first_small_chunk(M, B, P, I) {\
+  mchunkptr F = P->fd;\
+  assert(P != B);\
+  assert(P != F);\
+  assert(chunksize(P) == small_index2size(I));\
+  if (B == F) /* P's successor is the bin itself: clear the map bit */\
+    clear_smallmap(M, I);\
+  else if (RTCHECK(ok_address(M, F))) {\
+    B->fd = F; /* F becomes the first chunk */\
+    F->bk = B;\
+  }\
+  else {\
+    CORRUPTION_ERROR_ACTION(M);\
+  }\
+}
+
+
+
+/* Replace dv node, binning the old one; P of size S becomes the new dv */
+/* Used only when dvsize known to be small */
+#define replace_dv(M, P, S) {\
+  size_t DVS = M->dvsize;\
+  if (DVS != 0) { /* an old dv exists: move it into a smallbin first */\
+    mchunkptr DV = M->dv;\
+    assert(is_small(DVS));\
+    insert_small_chunk(M, DV, DVS);\
+  }\
+  M->dvsize = S;\
+  M->dv = P;\
+}
+
+/* ------------------------- Operations on trees ------------------------- */
+
+/* Insert chunk X of size S into the treebin of M selected by compute_tree_index(S) */
+#define insert_large_chunk(M, X, S) {\
+  tbinptr* H;\
+  bindex_t I;\
+  compute_tree_index(S, I);\
+  H = treebin_at(M, I);\
+  X->index = I;\
+  X->child[0] = X->child[1] = 0;\
+  if (!treemap_is_marked(M, I)) { /* unmarked bin: X becomes its root */\
+    mark_treemap(M, I);\
+    *H = X;\
+    X->parent = (tchunkptr)H;\
+    X->fd = X->bk = X;\
+  }\
+  else {\
+    tchunkptr T = *H;\
+    size_t K = S << leftshift_for_tree_index(I); /* top bit of K picks the child at each level */\
+    for (;;) {\
+      if (chunksize(T) != S) {\
+	tchunkptr* C = &(T->child[(K >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1]);\
+	K <<= 1;\
+	if (*C != 0)\
+	  T = *C;\
+	else if (RTCHECK(ok_address(M, C))) {\
+	  *C = X; /* empty child slot: attach X here */\
+	  X->parent = T;\
+	  X->fd = X->bk = X;\
+	  break;\
+	}\
+	else {\
+	  CORRUPTION_ERROR_ACTION(M);\
+	  break;\
+	}\
+      }\
+      else {\
+	tchunkptr F = T->fd;\
+	if (RTCHECK(ok_address(M, T) && ok_address(M, F))) {\
+	  T->fd = F->bk = X; /* same size as T: chain X between T and F */\
+	  X->fd = F;\
+	  X->bk = T;\
+	  X->parent = 0; /* chained nodes carry no parent */\
+	  break;\
+	}\
+	else {\
+	  CORRUPTION_ERROR_ACTION(M);\
+	  break;\
+	}\
+      }\
+    }\
+  }\
+}
+
+/*
+  Unlink steps:
+
+  1. If x is a chained node, unlink it from its same-sized fd/bk links
+     and choose its bk node as its replacement.
+  2. If x was the last node of its size, but not a leaf node, it must
+     be replaced with a leaf node (not merely one with an open left or
+     right), to make sure that lefts and rights of descendants
+     correspond properly to bit masks.  We use the rightmost descendant
+     of x.  We could use any other leaf, but this is easy to locate and
+     tends to counteract removal of leftmosts elsewhere, and so keeps
+     paths shorter than minimally guaranteed.  This doesn't loop much
+     because on average a node in a tree is near the bottom.
+  3. If x is the base of a chain (i.e., has parent links) relink
+     x's parent and children to x's replacement (or null if none).
+*/
+
+#define unlink_large_chunk(M, X) { /* remove tree chunk X from M; R ends up as X's replacement */\
+  tchunkptr XP = X->parent;\
+  tchunkptr R;\
+  if (X->bk != X) { /* step 1: X has same-sized neighbours, use its bk node */\
+    tchunkptr F = X->fd;\
+    R = X->bk;\
+    if (RTCHECK(ok_address(M, F))) {\
+      F->bk = R;\
+      R->fd = F;\
+    }\
+    else {\
+      CORRUPTION_ERROR_ACTION(M);\
+    }\
+  }\
+  else { /* step 2: X is alone in its chain, find a leaf below it */\
+    tchunkptr* RP;\
+    if (((R = *(RP = &(X->child[1]))) != 0) ||\
+	((R = *(RP = &(X->child[0]))) != 0)) {\
+      tchunkptr* CP;\
+      while ((*(CP = &(R->child[1])) != 0) ||\
+	     (*(CP = &(R->child[0])) != 0)) {\
+	R = *(RP = CP);\
+      }\
+      if (RTCHECK(ok_address(M, RP)))\
+	*RP = 0;\
+      else {\
+	CORRUPTION_ERROR_ACTION(M);\
+      }\
+    }\
+  }\
+  if (XP != 0) { /* step 3: X had parent links, relink them to R */\
+    tbinptr* H = treebin_at(M, X->index);\
+    if (X == *H) {\
+      if ((*H = R) == 0) \
+	clear_treemap(M, X->index);\
+    }\
+    else if (RTCHECK(ok_address(M, XP))) {\
+      if (XP->child[0] == X) \
+	XP->child[0] = R;\
+      else \
+	XP->child[1] = R;\
+    }\
+    else\
+      CORRUPTION_ERROR_ACTION(M);\
+    if (R != 0) {\
+      if (RTCHECK(ok_address(M, R))) {\
+	tchunkptr C0, C1;\
+	R->parent = XP;\
+	if ((C0 = X->child[0]) != 0) {\
+	  if (RTCHECK(ok_address(M, C0))) {\
+	    R->child[0] = C0;\
+	    C0->parent = R;\
+	  }\
+	  else\
+	    CORRUPTION_ERROR_ACTION(M);\
+	}\
+	if ((C1 = X->child[1]) != 0) {\
+	  if (RTCHECK(ok_address(M, C1))) {\
+	    R->child[1] = C1;\
+	    C1->parent = R;\
+	  }\
+	  else\
+	    CORRUPTION_ERROR_ACTION(M);\
+	}\
+      }\
+      else\
+	CORRUPTION_ERROR_ACTION(M);\
+    }\
+  }\
+}
+
+/* Relays to large vs small bin operations */
+/* Each expands to a bare if/else statement, so use it only as a complete statement. */
+#define insert_chunk(M, P, S)\
+  if (is_small(S)) insert_small_chunk(M, P, S)\
+  else { tchunkptr TP = (tchunkptr)(P); insert_large_chunk(M, TP, S); }
+
+#define unlink_chunk(M, P, S)\
+  if (is_small(S)) unlink_small_chunk(M, P, S)\
+  else { tchunkptr TP = (tchunkptr)(P); unlink_large_chunk(M, TP); }
+
+
+/* Relays to internal calls to malloc/free from realloc, memalign etc */
+/* internal_free keeps its statement form (with trailing ';') for existing call sites. */
+#if ONLY_MSPACES
+#define internal_malloc(m, b) mspace_malloc(m, b)
+#define internal_free(m, mem) mspace_free(m,mem);
+#else /* ONLY_MSPACES */
+#if MSPACES
+#define internal_malloc(m, b)\
+   (((m) == gm)? dlmalloc(b) : mspace_malloc(m, b))
+#define internal_free(m, mem)\
+   if (m == gm) dlfree(mem); else mspace_free(m,mem);
+#else /* MSPACES */
+#define internal_malloc(m, b) dlmalloc(b)
+#define internal_free(m, mem) dlfree(mem)
+#endif /* MSPACES */
+#endif /* ONLY_MSPACES */
+
+/* -----------------------  Direct-mmapping chunks ----------------------- */
+
+/*
+  Directly mmapped chunks are set up with an offset to the start of
+  the mmapped region stored in the prev_foot field of the chunk. This
+  allows reconstruction of the required argument to MUNMAP when freed,
+  and also allows adjustment of the returned chunk to meet alignment
+  requirements (especially in memalign).  There is also enough space
+  allocated to hold a fake next chunk of size SIZE_T_SIZE to maintain
+  the PINUSE bit so frees can be checked.
+*/
+
+/* Serve a request of nb bytes from its own mmapped region; returns 0 on failure */
+static void* mmap_alloc(mstate m, size_t nb) {
+  size_t mmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+  size_t offset, psize;
+  char* mm;
+  mchunkptr p;
+  if (mmsize <= nb ||     /* size computation wrapped around 0 */
+      (mm = (char*)(CALL_DIRECT_MMAP(mmsize))) == CMFAIL)
+    return 0;
+  /* place the chunk at the first aligned spot and remember the offset */
+  offset = align_offset(chunk2mem(mm));
+  psize = mmsize - offset - MMAP_FOOT_PAD;
+  p = (mchunkptr)(mm + offset);
+  p->prev_foot = offset | IS_MMAPPED_BIT;
+  p->head = (psize|CINUSE_BIT);
+  mark_inuse_foot(m, p, psize);
+  chunk_plus_offset(p, psize)->head = FENCEPOST_HEAD;
+  chunk_plus_offset(p, psize+SIZE_T_SIZE)->head = 0;
+  if (mm < m->least_addr)
+    m->least_addr = mm;
+  if ((m->footprint += mmsize) > m->max_footprint)
+    m->max_footprint = m->footprint;
+  assert(is_aligned(chunk2mem(p)));
+  check_mmapped_chunk(m, p);
+  return chunk2mem(p);
+}
+
+/* Resize mmapped chunk oldp for a request of nb bytes; returns the chunk to use, or 0 */
+static mchunkptr mmap_resize(mstate m, mchunkptr oldp, size_t nb) {
+  size_t oldsize = chunksize(oldp);
+  size_t offset, oldmmsize, newmmsize, psize;
+  mchunkptr newp;
+  char* cp;
+  if (is_small(nb)) /* Can't shrink mmap regions below small size */
+    return 0;
+  /* Keep old chunk if big enough but not too big */
+  if (oldsize >= nb + SIZE_T_SIZE &&
+      (oldsize - nb) <= (mparams.granularity << 1))
+    return oldp;
+  offset = oldp->prev_foot & ~IS_MMAPPED_BIT;
+  oldmmsize = oldsize + offset + MMAP_FOOT_PAD;
+  newmmsize = mmap_align(nb + SIX_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+  cp = (char*)CALL_MREMAP((char*)oldp - offset, oldmmsize, newmmsize, 1);
+  if (cp == CMFAIL)
+    return 0;
+
+  /* rebuild the chunk head and the two trailing heads at the new address */
+  newp = (mchunkptr)(cp + offset);
+  psize = newmmsize - offset - MMAP_FOOT_PAD;
+  newp->head = (psize|CINUSE_BIT);
+  mark_inuse_foot(m, newp, psize);
+  chunk_plus_offset(newp, psize)->head = FENCEPOST_HEAD;
+  chunk_plus_offset(newp, psize+SIZE_T_SIZE)->head = 0;
+  if (cp < m->least_addr)
+    m->least_addr = cp;
+  if ((m->footprint += newmmsize - oldmmsize) > m->max_footprint)
+    m->max_footprint = m->footprint;
+  check_mmapped_chunk(m, newp);
+  return newp;
+}
+
+/* -------------------------- mspace management -------------------------- */
+
+/* Install p, moved forward to an aligned payload, as the top chunk of m */
+static void init_top(mstate m, mchunkptr p, size_t psize) {
+  /* the alignment shift is taken out of the size as well */
+  const size_t adjust = align_offset(chunk2mem(p));
+  const size_t topsize = psize - adjust;
+  mchunkptr top = (mchunkptr)((char*)p + adjust);
+
+  m->top = top;
+  m->topsize = topsize;
+  top->head = topsize | PINUSE_BIT;
+  /* set size of fake trailing chunk holding overhead space only once */
+  chunk_plus_offset(top, topsize)->head = TOP_FOOT_SIZE;
+  m->trim_check = mparams.trim_threshold; /* reset on each update */
+}
+
+/* Make every smallbin of m an empty ring: its fd and bk both point at the bin itself */
+static void init_bins(mstate m) {
+  bindex_t idx;
+  for (idx = 0; idx != NSMALLBINS; ++idx) {
+    sbinptr head = smallbin_at(m, idx);
+    head->bk = head;
+    head->fd = head;
+  }
+}
+
+#if PROCEED_ON_ERROR
+
+/* default corruption action: count the error and make m forget all memory it managed */
+static void reset_on_error(mstate m) {
+  int i;
+  ++malloc_corruption_error_count;
+  /* Reinitialize fields to forget about all memory */
+  m->smallmap = m->treemap = 0; /* clear the bin bitmaps; the bins themselves are reset below */
+  m->dvsize = m->topsize = 0;
+  m->seg.base = 0;
+  m->seg.size = 0;
+  m->seg.next = 0;
+  m->top = m->dv = 0;
+  for (i = 0; i < NTREEBINS; ++i)
+    *treebin_at(m, i) = 0; /* empty every treebin */
+  init_bins(m); /* relink every smallbin to itself */
+}
+#endif /* PROCEED_ON_ERROR */
+
+/* Allocate chunk of nb bytes at newbase and prepend remainder with chunk in successor base (oldbase). */
+static void* prepend_alloc(mstate m, char* newbase, char* oldbase,
+			   size_t nb) {
+  mchunkptr p = align_as_chunk(newbase);
+  mchunkptr oldfirst = align_as_chunk(oldbase);
+  size_t psize = (char*)oldfirst - (char*)p; /* bytes between the two aligned starts */
+  mchunkptr q = chunk_plus_offset(p, nb); /* remainder starts right after the nb bytes */
+  size_t qsize = psize - nb;
+  set_size_and_pinuse_of_inuse_chunk(m, p, nb);
+
+  assert((char*)oldfirst > (char*)q);
+  assert(pinuse(oldfirst));
+  assert(qsize >= MIN_CHUNK_SIZE);
+
+  /* consolidate remainder with first chunk of old base */
+  if (oldfirst == m->top) { /* old first chunk was top: top now starts at q */
+    size_t tsize = m->topsize += qsize;
+    m->top = q;
+    q->head = tsize | PINUSE_BIT;
+    check_top_chunk(m, q);
+  }
+  else if (oldfirst == m->dv) { /* old first chunk was dv: dv now starts at q */
+    size_t dsize = m->dvsize += qsize;
+    m->dv = q;
+    set_size_and_pinuse_of_free_chunk(q, dsize);
+  }
+  else {
+    if (!cinuse(oldfirst)) { /* free neighbour: unlink it and absorb its size */
+      size_t nsize = chunksize(oldfirst);
+      unlink_chunk(m, oldfirst, nsize);
+      oldfirst = chunk_plus_offset(oldfirst, nsize);
+      qsize += nsize;
+    }
+    set_free_with_pinuse(q, qsize, oldfirst);
+    insert_chunk(m, q, qsize);
+    check_free_chunk(m, q);
+  }
+
+  check_malloced_chunk(m, chunk2mem(p), nb);
+  return chunk2mem(p);
+}
+
+/* Add a segment to hold a new noncontiguous region.
+
+   m       - malloc state gaining the segment
+   tbase   - base address of the new region
+   tsize   - size in bytes of the new region
+   mmapped - flag bits stored as the new segment's sflags
+
+   The new region becomes the top chunk and is described by m->seg.  The
+   previous contents of m->seg are copied into an in-use chunk carved near
+   the end of the segment that held the old top, and linked as m->seg.next.
+   The words after that record, up to the end of the old segment, are
+   filled with fencepost heads, and whatever remains of the old top in
+   front of the record is put into a bin as an ordinary free chunk.
+*/
+static void add_segment(mstate m, char* tbase, size_t tsize, flag_t mmapped) {
+  /* Determine locations and sizes of segment, fenceposts, old top */
+  char* old_top = (char*)m->top;
+  msegmentptr oldsp = segment_holding(m, old_top);
+  char* old_end = oldsp->base + oldsp->size;
+  size_t ssize = pad_request(sizeof(struct malloc_segment));
+  char* rawsp = old_end - (ssize + FOUR_SIZE_T_SIZES + CHUNK_ALIGN_MASK);
+  size_t offset = align_offset(chunk2mem(rawsp));
+  char* asp = rawsp + offset;
+  char* csp = (asp < (old_top + MIN_CHUNK_SIZE))? old_top : asp; /* fall back
+     to old_top itself when less than MIN_CHUNK_SIZE would be left in front
+     of the record */
+  mchunkptr sp = (mchunkptr)csp; /* chunk that will hold the saved record */
+  msegmentptr ss = (msegmentptr)(chunk2mem(sp));
+  mchunkptr tnext = chunk_plus_offset(sp, ssize);
+  mchunkptr p = tnext; /* cursor for writing fenceposts */
+  int nfences = 0;
+
+  /* reset top to new space */
+  init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
+
+  /* Set up segment record */
+  assert(is_aligned(ss));
+  set_size_and_pinuse_of_inuse_chunk(m, sp, ssize);
+  *ss = m->seg; /* Push current record */
+  m->seg.base = tbase;
+  m->seg.size = tsize;
+  m->seg.sflags = mmapped;
+  m->seg.next = ss;
+
+  /* Insert trailing fenceposts: one head per SIZE_T_SIZE step, stopping
+     once the next head would no longer lie below old_end */
+  for (;;) {
+    mchunkptr nextp = chunk_plus_offset(p, SIZE_T_SIZE);
+    p->head = FENCEPOST_HEAD;
+    ++nfences;
+    if ((char*)(&(nextp->head)) < old_end)
+      p = nextp;
+    else
+      break;
+  }
+  assert(nfences >= 2);
+
+  /* Insert the rest of old top into a bin as an ordinary free chunk */
+  if (csp != old_top) {
+    mchunkptr q = (mchunkptr)old_top;
+    size_t psize = csp - old_top;
+    mchunkptr tn = chunk_plus_offset(q, psize);
+    set_free_with_pinuse(q, psize, tn);
+    insert_chunk(m, q, psize);
+  }
+
+  check_top_chunk(m, m->top);
+}
+
+/* -------------------------- System allocation -------------------------- */
+
+/* Get memory from system using MORECORE or MMAP.
+
+   m  - malloc state requesting memory
+   nb - padded chunk size that must be allocatable afterwards
+
+   Returns the user pointer of an nb-byte chunk: either directly from
+   mmap_alloc, from prepend_alloc, or split off the new or extended top.
+   Returns 0, after running MALLOC_FAILURE_ACTION, when no memory could be
+   obtained or when top is still not larger than nb afterwards.
+*/
+static void* sys_alloc(mstate m, size_t nb) {
+  char* tbase = CMFAIL; /* base of obtained memory; CMFAIL until a path succeeds */
+  size_t tsize = 0; /* size of obtained memory */
+  flag_t mmap_flag = 0; /* IS_MMAPPED_BIT if the memory came from CALL_MMAP */
+
+  ensure_initialization();
+
+  /* Directly map large chunks */
+  if (use_mmap(m) && nb >= mparams.mmap_threshold) {
+    void* mem = mmap_alloc(m, nb);
+    if (mem != 0)
+      return mem;
+  }
+
+  /*
+    Try getting memory in any of three ways (in most-preferred to
+    least-preferred order):
+    1. A call to MORECORE that can normally contiguously extend memory.
+       (disabled if not MORECORE_CONTIGUOUS or not HAVE_MORECORE or
+       main space is mmapped or a previous contiguous call failed)
+    2. A call to MMAP new space (disabled if not HAVE_MMAP).
+       Note that under the default settings, if MORECORE is unable to
+       fulfill a request, and HAVE_MMAP is true, then mmap is
+       used as a noncontiguous system allocator. This is a useful backup
+       strategy for systems with holes in address spaces -- in this case
+       sbrk cannot contiguously expand the heap, but mmap may be able to
+       find space.
+    3. A call to MORECORE that cannot usually contiguously extend memory.
+       (disabled if not HAVE_MORECORE)
+
+   In all cases, we need to request enough bytes from system to ensure
+   we can malloc nb bytes upon success, so pad with enough space for
+   top_foot, plus alignment-pad to make sure we don't lose bytes if
+   not on boundary, and round this up to a granularity unit.
+  */
+
+  if (MORECORE_CONTIGUOUS && !use_noncontiguous(m)) {
+    char* br = CMFAIL; /* result of the sizing MORECORE call, if one was made */
+    msegmentptr ss = (m->top == 0)? 0 : segment_holding(m, (char*)m->top);
+    size_t asize = 0;
+    ACQUIRE_MALLOC_GLOBAL_LOCK();
+
+    if (ss == 0) {  /* First time through or recovery */
+      char* base = (char*)CALL_MORECORE(0);
+      if (base != CMFAIL) {
+	asize = granularity_align(nb + SYS_ALLOC_PADDING);
+	/* Adjust to end on a page boundary */
+	if (!is_page_aligned(base))
+	  asize += (page_align((size_t)base) - (size_t)base);
+	/* Can't call MORECORE if size is negative when treated as signed */
+	if (asize < HALF_MAX_SIZE_T &&
+	    (br = (char*)(CALL_MORECORE(asize))) == base) {
+	  tbase = base;
+	  tsize = asize;
+	}
+      }
+    }
+    else {
+      /* Subtract out existing available top space from MORECORE request. */
+      asize = granularity_align(nb - m->topsize + SYS_ALLOC_PADDING);
+      /* Use mem here only if it did continuously extend old space */
+      if (asize < HALF_MAX_SIZE_T &&
+	  (br = (char*)(CALL_MORECORE(asize))) == ss->base+ss->size) {
+	tbase = br;
+	tsize = asize;
+      }
+    }
+
+    if (tbase == CMFAIL) {    /* Cope with partial failure */
+      if (br != CMFAIL) {    /* Try to use/extend the space we did get */
+	if (asize < HALF_MAX_SIZE_T &&
+	    asize < nb + SYS_ALLOC_PADDING) {
+	  size_t esize = granularity_align(nb + SYS_ALLOC_PADDING - asize);
+	  if (esize < HALF_MAX_SIZE_T) {
+	    char* end = (char*)CALL_MORECORE(esize);
+	    if (end != CMFAIL)
+	      asize += esize;
+	    else {            /* Can't use; try to release */
+	      (void) CALL_MORECORE(-asize);
+	      br = CMFAIL;
+	    }
+	  }
+	}
+      }
+      if (br != CMFAIL) {    /* Use the space we did get */
+	tbase = br;
+	tsize = asize;
+      }
+      else
+	disable_contiguous(m); /* Don't try contiguous path in the future */
+    }
+
+    RELEASE_MALLOC_GLOBAL_LOCK();
+  }
+
+  if (HAVE_MMAP && tbase == CMFAIL) {  /* Try MMAP */
+    size_t rsize = granularity_align(nb + SYS_ALLOC_PADDING);
+    if (rsize > nb) { /* Fail if wraps around zero */
+      char* mp = (char*)(CALL_MMAP(rsize));
+      if (mp != CMFAIL) {
+	tbase = mp;
+	tsize = rsize;
+	mmap_flag = IS_MMAPPED_BIT;
+      }
+    }
+  }
+
+  if (HAVE_MORECORE && tbase == CMFAIL) { /* Try noncontiguous MORECORE */
+    size_t asize = granularity_align(nb + SYS_ALLOC_PADDING);
+    if (asize < HALF_MAX_SIZE_T) {
+      char* br = CMFAIL;
+      char* end = CMFAIL;
+      ACQUIRE_MALLOC_GLOBAL_LOCK();
+      br = (char*)(CALL_MORECORE(asize));
+      end = (char*)(CALL_MORECORE(0));
+      RELEASE_MALLOC_GLOBAL_LOCK();
+      if (br != CMFAIL && end != CMFAIL && br < end) {
+	size_t ssize = end - br; /* use what was actually obtained, measured
+	   from the two break values */
+	if (ssize > nb + TOP_FOOT_SIZE) {
+	  tbase = br;
+	  tsize = ssize;
+	}
+      }
+    }
+  }
+
+  if (tbase != CMFAIL) { /* some path succeeded: fold the memory into m */
+
+    if ((m->footprint += tsize) > m->max_footprint)
+      m->max_footprint = m->footprint;
+
+    if (!is_initialized(m)) { /* first-time initialization */
+      m->seg.base = m->least_addr = tbase;
+      m->seg.size = tsize;
+      m->seg.sflags = mmap_flag;
+      m->magic = mparams.magic;
+      m->release_checks = MAX_RELEASE_CHECK_RATE;
+      init_bins(m);
+#if !ONLY_MSPACES
+      if (is_global(m))
+	init_top(m, (mchunkptr)tbase, tsize - TOP_FOOT_SIZE);
+      else
+#endif
+      {
+	/* Offset top by embedded malloc_state */
+	mchunkptr mn = next_chunk(mem2chunk(m));
+	init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) -TOP_FOOT_SIZE);
+      }
+    }
+
+    else {
+      /* Try to merge with an existing segment */
+      msegmentptr sp = &m->seg;
+      /* Only consider most recent segment if traversal suppressed */
+      while (sp != 0 && tbase != sp->base + sp->size)
+	sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
+      if (sp != 0 &&
+	  !is_extern_segment(sp) &&
+	  (sp->sflags & IS_MMAPPED_BIT) == mmap_flag &&
+	  segment_holds(sp, m->top)) { /* append */
+	sp->size += tsize;
+	init_top(m, m->top, m->topsize + tsize);
+      }
+      else {
+	if (tbase < m->least_addr)
+	  m->least_addr = tbase;
+	sp = &m->seg;
+	/* look for a segment that starts exactly where the new memory ends */
+	while (sp != 0 && sp->base != tbase + tsize)
+	  sp = (NO_SEGMENT_TRAVERSAL) ? 0 : sp->next;
+	if (sp != 0 &&
+	    !is_extern_segment(sp) &&
+	    (sp->sflags & IS_MMAPPED_BIT) == mmap_flag) {
+	  char* oldbase = sp->base;
+	  sp->base = tbase;
+	  sp->size += tsize;
+	  return prepend_alloc(m, tbase, oldbase, nb);
+	}
+	else
+	  add_segment(m, tbase, tsize, mmap_flag);
+      }
+    }
+
+    if (nb < m->topsize) { /* Allocate from new or extended top space */
+      size_t rsize = m->topsize -= nb;
+      mchunkptr p = m->top;
+      mchunkptr r = m->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(m, p, nb);
+      check_top_chunk(m, m->top);
+      check_malloced_chunk(m, chunk2mem(p), nb);
+      return chunk2mem(p);
+    }
+  }
+
+  MALLOC_FAILURE_ACTION;
+  return 0;
+}
+
+/* -----------------------  system deallocation -------------------------- */
+
+/* Unmap and unlink any mmapped segments that don't contain used chunks.
+
+   Walks the segment records that follow the head record m->seg.  A
+   segment is released when it is mmapped, not extern, and its first chunk
+   is free and reaches to within TOP_FOOT_SIZE of the segment's end.
+
+   Also resets m->release_checks to the larger of the number of segments
+   visited and MAX_RELEASE_CHECK_RATE.
+
+   Returns the total number of bytes successfully unmapped.
+*/
+static size_t release_unused_segments(mstate m) {
+  size_t released = 0;
+  int nsegs = 0;
+  msegmentptr pred = &m->seg; /* record whose next link points at sp */
+  msegmentptr sp = pred->next;
+  while (sp != 0) {
+    char* base = sp->base;
+    size_t size = sp->size;
+    msegmentptr next = sp->next; /* saved now: sp may be unmapped below */
+    ++nsegs;
+    if (is_mmapped_segment(sp) && !is_extern_segment(sp)) {
+      mchunkptr p = align_as_chunk(base);
+      size_t psize = chunksize(p);
+      /* Can unmap if first chunk holds entire segment and not pinned */
+      if (!cinuse(p) && (char*)p + psize >= base + size - TOP_FOOT_SIZE) {
+	tchunkptr tp = (tchunkptr)p;
+	assert(segment_holds(sp, (char*)sp));
+	if (p == m->dv) { /* chunk is the dv chunk: just forget dv */
+	  m->dv = 0;
+	  m->dvsize = 0;
+	}
+	else {
+	  unlink_large_chunk(m, tp);
+	}
+	if (CALL_MUNMAP(base, size) == 0) {
+	  released += size;
+	  m->footprint -= size;
+	  /* unlink obsoleted record */
+	  sp = pred;
+	  sp->next = next;
+	}
+	else { /* back out if cannot unmap */
+	  insert_large_chunk(m, tp, psize);
+	}
+      }
+    }
+    if (NO_SEGMENT_TRAVERSAL) /* scan only first segment */
+      break;
+    pred = sp;
+    sp = next;
+  }
+  /* Reset check counter */
+  m->release_checks = ((nsegs > MAX_RELEASE_CHECK_RATE)?
+		       nsegs : MAX_RELEASE_CHECK_RATE);
+  return released;
+}
+/*
+  Give unused memory back to the system.
+
+  m   - malloc state to trim
+  pad - bytes to leave in top beyond the TOP_FOOT_SIZE segment overhead
+
+  When top is larger than the padded amount, the excess is computed in
+  whole granularity units (keeping at least one) and released from the
+  segment holding top, via mremap/munmap for an mmapped segment or a
+  negative MORECORE otherwise.  Unused mmapped segments are then released
+  as well when HAVE_MMAP.  If nothing could be released although top
+  exceeds the trim check, trim_check is set to MAX_SIZE_T.
+
+  Returns 1 if any memory was released, otherwise 0.
+*/
+static int sys_trim(mstate m, size_t pad) {
+  size_t released = 0;
+  ensure_initialization();
+  if (pad < MAX_REQUEST && is_initialized(m)) {
+    pad += TOP_FOOT_SIZE; /* ensure enough room for segment overhead */
+
+    if (m->topsize > pad) {
+      /* Shrink top space in granularity-size units, keeping at least one */
+      size_t unit = mparams.granularity;
+      size_t extra = ((m->topsize - pad + (unit - SIZE_T_ONE)) / unit -
+		      SIZE_T_ONE) * unit;
+      msegmentptr sp = segment_holding(m, (char*)m->top);
+
+      if (!is_extern_segment(sp)) {
+	if (is_mmapped_segment(sp)) {
+	  if (HAVE_MMAP &&
+	      sp->size >= extra &&
+	      !has_segment_link(m, sp)) { /* can't shrink if pinned */
+	    size_t newsize = sp->size - extra;
+	    /* Prefer mremap, fall back to munmap */
+	    if ((CALL_MREMAP(sp->base, sp->size, newsize, 0) != MFAIL) ||
+		(CALL_MUNMAP(sp->base + newsize, extra) == 0)) {
+	      released = extra;
+	    }
+	  }
+	}
+	else if (HAVE_MORECORE) {
+	  if (extra >= HALF_MAX_SIZE_T) /* Avoid wrapping negative */
+	    extra = (HALF_MAX_SIZE_T) + SIZE_T_ONE - unit;
+	  ACQUIRE_MALLOC_GLOBAL_LOCK();
+	  {
+	    /* Make sure end of memory is where we last set it. */
+	    char* old_br = (char*)(CALL_MORECORE(0));
+	    if (old_br == sp->base + sp->size) {
+	      char* rel_br = (char*)(CALL_MORECORE(-extra));
+	      char* new_br = (char*)(CALL_MORECORE(0));
+	      if (rel_br != CMFAIL && new_br < old_br)
+		released = old_br - new_br; /* count what actually moved */
+	    }
+	  }
+	  RELEASE_MALLOC_GLOBAL_LOCK();
+	}
+      }
+
+      if (released != 0) { /* shrink the segment record and top to match */
+	sp->size -= released;
+	m->footprint -= released;
+	init_top(m, m->top, m->topsize - released);
+	check_top_chunk(m, m->top);
+      }
+    }
+
+    /* Unmap any unused mmapped segments */
+    if (HAVE_MMAP)
+      released += release_unused_segments(m);
+
+    /* On failure, disable autotrim to avoid repeated failed future calls */
+    if (released == 0 && m->topsize > m->trim_check)
+      m->trim_check = MAX_SIZE_T;
+  }
+
+  return (released != 0)? 1 : 0;
+}
+
+
+/* ---------------------------- malloc support --------------------------- */
+
+/* allocate a large request from the best fitting chunk in a treebin.
+
+   m  - malloc state to allocate from
+   nb - padded request size
+
+   Searches the treebin computed for nb and, if that yields no candidate,
+   the next non-empty treebin, for the chunk whose size exceeds nb by the
+   least.  The chosen chunk is used only if its remainder is strictly
+   smaller than the remainder the dv chunk would leave.
+
+   Returns the user pointer of the chosen chunk (split when the remainder
+   is at least MIN_CHUNK_SIZE, with the remainder re-inserted into a bin),
+   or 0 when nothing suitable was found, when dv is the better fit, or
+   after CORRUPTION_ERROR_ACTION.
+*/
+static void* tmalloc_large(mstate m, size_t nb) {
+  tchunkptr v = 0; /* best candidate so far */
+  size_t rsize = -nb; /* Unsigned negation */
+  tchunkptr t;
+  bindex_t idx;
+  compute_tree_index(nb, idx);
+  if ((t = *treebin_at(m, idx)) != 0) {
+    /* Traverse tree for this bin looking for node with size == nb */
+    size_t sizebits = nb << leftshift_for_tree_index(idx);
+    tchunkptr rst = 0;  /* The deepest untaken right subtree */
+    for (;;) {
+      tchunkptr rt;
+      size_t trem = chunksize(t) - nb; /* wraps to a huge value when t is
+         smaller than nb, so such nodes never beat rsize */
+      if (trem < rsize) {
+	v = t;
+	if ((rsize = trem) == 0)
+	  break;
+      }
+      rt = t->child[1];
+      t = t->child[(sizebits >> (SIZE_T_BITSIZE-SIZE_T_ONE)) & 1];
+      if (rt != 0 && rt != t)
+	rst = rt;
+      if (t == 0) {
+	t = rst; /* set t to least subtree holding sizes > nb */
+	break;
+      }
+      sizebits <<= 1;
+    }
+  }
+  if (t == 0 && v == 0) { /* set t to root of next non-empty treebin */
+    binmap_t leftbits = left_bits(idx2bit(idx)) & m->treemap;
+    if (leftbits != 0) {
+      bindex_t i;
+      binmap_t leastbit = least_bit(leftbits);
+      compute_bit2idx(leastbit, i);
+      t = *treebin_at(m, i);
+    }
+  }
+
+  while (t != 0) { /* find smallest of tree or subtree */
+    size_t trem = chunksize(t) - nb;
+    if (trem < rsize) {
+      rsize = trem;
+      v = t;
+    }
+    t = leftmost_child(t);
+  }
+
+  /*  If dv is a better fit, return 0 so malloc will use it */
+  if (v != 0 && rsize < (size_t)(m->dvsize - nb)) {
+    if (RTCHECK(ok_address(m, v))) { /* split */
+      mchunkptr r = chunk_plus_offset(v, nb);
+      assert(chunksize(v) == rsize + nb);
+      if (RTCHECK(ok_next(v, r))) {
+	unlink_large_chunk(m, v);
+	if (rsize < MIN_CHUNK_SIZE) /* remainder too small: hand out it all */
+	  set_inuse_and_pinuse(m, v, (rsize + nb));
+	else {
+	  set_size_and_pinuse_of_inuse_chunk(m, v, nb);
+	  set_size_and_pinuse_of_free_chunk(r, rsize);
+	  insert_chunk(m, r, rsize);
+	}
+	return chunk2mem(v);
+      }
+    }
+    CORRUPTION_ERROR_ACTION(m);
+  }
+  return 0;
+}
+
+/* allocate a small request from the best fitting chunk in a treebin.
+
+   m  - malloc state to allocate from
+   nb - padded request size
+
+   Takes the treebin selected by the least set bit of m->treemap and
+   follows leftmost_child from its root, keeping the chunk that leaves the
+   smallest remainder.  The root is read without a null check, so the
+   caller must ensure m->treemap is non-zero (dlmalloc tests this before
+   calling).
+
+   Returns the user pointer of the chosen chunk; a remainder of at least
+   MIN_CHUNK_SIZE is split off and installed as the new dv chunk.  Returns
+   0 after CORRUPTION_ERROR_ACTION if the chunk fails the runtime checks.
+*/
+static void* tmalloc_small(mstate m, size_t nb) {
+  tchunkptr t, v;
+  size_t rsize;
+  bindex_t i;
+  binmap_t leastbit = least_bit(m->treemap);
+  compute_bit2idx(leastbit, i);
+  v = t = *treebin_at(m, i);
+  rsize = chunksize(t) - nb;
+
+  while ((t = leftmost_child(t)) != 0) {
+    size_t trem = chunksize(t) - nb;
+    if (trem < rsize) {
+      rsize = trem;
+      v = t;
+    }
+  }
+
+  if (RTCHECK(ok_address(m, v))) {
+    mchunkptr r = chunk_plus_offset(v, nb);
+    assert(chunksize(v) == rsize + nb);
+    if (RTCHECK(ok_next(v, r))) {
+      unlink_large_chunk(m, v);
+      if (rsize < MIN_CHUNK_SIZE) /* remainder too small: hand out it all */
+	set_inuse_and_pinuse(m, v, (rsize + nb));
+      else {
+	set_size_and_pinuse_of_inuse_chunk(m, v, nb);
+	set_size_and_pinuse_of_free_chunk(r, rsize);
+	replace_dv(m, r, rsize);
+      }
+      return chunk2mem(v);
+    }
+  }
+
+  CORRUPTION_ERROR_ACTION(m);
+  return 0;
+}
+
+/* --------------------------- realloc support --------------------------- */
+/*
+  Resize the block at oldmem, owned by m, to hold bytes bytes.
+
+  Returns 0 (after MALLOC_FAILURE_ACTION) when bytes >= MAX_REQUEST, 0 when
+  PREACTION fails, and 0 (after USAGE_ERROR_ACTION) when the old chunk
+  fails the runtime checks.
+
+  Otherwise the block is resized in place when possible: an mmapped chunk
+  is handed to mmap_resize, a chunk that is already big enough is kept
+  (splitting off and freeing a tail of at least MIN_CHUNK_SIZE), and a
+  chunk bordering top is extended into top when top has room.  When none
+  of these produce a chunk, a new block is obtained with internal_malloc,
+  the smaller of the old payload size and bytes is copied over, and the
+  old block is freed; if that allocation fails, 0 is returned and the old
+  block is left as it was.
+*/
+static void* internal_realloc(mstate m, void* oldmem, size_t bytes) {
+  if (bytes >= MAX_REQUEST) {
+    MALLOC_FAILURE_ACTION;
+    return 0;
+  }
+  if (!PREACTION(m)) {
+    mchunkptr oldp = mem2chunk(oldmem);
+    size_t oldsize = chunksize(oldp);
+    mchunkptr next = chunk_plus_offset(oldp, oldsize);
+    mchunkptr newp = 0; /* stays 0 when in-place resizing is not possible */
+    void* extra = 0; /* split-off tail to free once POSTACTION has run */
+
+    /* Try to either shrink or extend into top. Else malloc-copy-free */
+
+    if (RTCHECK(ok_address(m, oldp) && ok_cinuse(oldp) &&
+		ok_next(oldp, next) && ok_pinuse(next))) {
+      size_t nb = request2size(bytes);
+      if (is_mmapped(oldp))
+	newp = mmap_resize(m, oldp, nb);
+      else if (oldsize >= nb) { /* already big enough */
+	size_t rsize = oldsize - nb;
+	newp = oldp;
+	if (rsize >= MIN_CHUNK_SIZE) {
+	  mchunkptr remainder = chunk_plus_offset(newp, nb);
+	  set_inuse(m, newp, nb);
+	  set_inuse(m, remainder, rsize);
+	  extra = chunk2mem(remainder);
+	}
+      }
+      else if (next == m->top && oldsize + m->topsize > nb) {
+	/* Expand into top */
+	size_t newsize = oldsize + m->topsize;
+	size_t newtopsize = newsize - nb;
+	mchunkptr newtop = chunk_plus_offset(oldp, nb);
+	set_inuse(m, oldp, nb);
+	newtop->head = newtopsize |PINUSE_BIT;
+	m->top = newtop;
+	m->topsize = newtopsize;
+	newp = oldp;
+      }
+    }
+    else {
+      USAGE_ERROR_ACTION(m, oldmem);
+      POSTACTION(m);
+      return 0;
+    }
+
+    POSTACTION(m);
+
+    if (newp != 0) {
+      if (extra != 0) {
+	internal_free(m, extra);
+      }
+      check_inuse_chunk(m, newp);
+      return chunk2mem(newp);
+    }
+    else {
+      void* newmem = internal_malloc(m, bytes);
+      if (newmem != 0) {
+	size_t oc = oldsize - overhead_for(oldp); /* old payload size */
+	memcpy(newmem, oldmem, (oc < bytes)? oc : bytes);
+	internal_free(m, oldmem);
+      }
+      return newmem;
+    }
+  }
+  return 0;
+}
+
+/* --------------------------- memalign support -------------------------- */
+
+/*
+  Allocate bytes bytes from m at an address that is a multiple of
+  alignment.
+
+  alignment is raised to MIN_CHUNK_SIZE if smaller and rounded up to a
+  power of 2 if it is not one; an alignment of at most MALLOC_ALIGNMENT is
+  served by plain internal_malloc.  An alignment that cannot be rounded up
+  to a power of 2 representable in size_t is treated as an allocation
+  failure.
+
+  The request is over-allocated, an aligned spot is located inside the
+  chunk, and the unused leading and trailing parts are freed again.
+
+  Returns the aligned user pointer, or 0 on failure.
+
+  NOTE(review): when PREACTION fails after the over-sized chunk has been
+  obtained, the function returns 0 without freeing that chunk.
+*/
+static void* internal_memalign(mstate m, size_t alignment, size_t bytes) {
+  if (alignment <= MALLOC_ALIGNMENT)    /* Can just use malloc */
+    return internal_malloc(m, bytes);
+  if (alignment <  MIN_CHUNK_SIZE) /* must be at least a minimum chunk size */
+    alignment = MIN_CHUNK_SIZE;
+  if ((alignment & (alignment-SIZE_T_ONE)) != 0) {/* Ensure a power of 2 */
+    size_t a = MALLOC_ALIGNMENT << 1;
+    if (alignment > HALF_MAX_SIZE_T) {
+      /* No power of 2 this large fits in size_t: doubling a would wrap to
+         0 and the loop below would never terminate. */
+      MALLOC_FAILURE_ACTION;
+      return 0;
+    }
+    while (a < alignment) a <<= 1;
+    alignment = a;
+  }
+
+  if (bytes >= MAX_REQUEST - alignment) {
+    if (m != 0)  { /* Test isn't needed but avoids compiler warning */
+      MALLOC_FAILURE_ACTION;
+    }
+  }
+  else {
+    size_t nb = request2size(bytes);
+    size_t req = nb + alignment + MIN_CHUNK_SIZE - CHUNK_OVERHEAD;
+    char* mem = (char*)internal_malloc(m, req);
+    if (mem != 0) {
+      void* leader = 0;  /* leading part to free after unlocking */
+      void* trailer = 0; /* trailing part to free after unlocking */
+      mchunkptr p = mem2chunk(mem);
+
+      if (PREACTION(m)) return 0;
+      if ((((size_t)(mem)) % alignment) != 0) { /* misaligned */
+	/*
+	  Find an aligned spot inside chunk.  Since we need to give
+	  back leading space in a chunk of at least MIN_CHUNK_SIZE, if
+	  the first calculation places us at a spot with less than
+	  MIN_CHUNK_SIZE leader, we can move to the next aligned spot.
+	  We've allocated enough total room so that this is always
+	  possible.
+	*/
+	char* br = (char*)mem2chunk((size_t)(((size_t)(mem +
+						       alignment -
+						       SIZE_T_ONE)) &
+					     -alignment));
+	char* pos = ((size_t)(br - (char*)(p)) >= MIN_CHUNK_SIZE)?
+	  br : br+alignment;
+	mchunkptr newp = (mchunkptr)pos;
+	size_t leadsize = pos - (char*)(p);
+	size_t newsize = chunksize(p) - leadsize;
+
+	if (is_mmapped(p)) { /* For mmapped chunks, just adjust offset */
+	  newp->prev_foot = p->prev_foot + leadsize;
+	  newp->head = (newsize|CINUSE_BIT);
+	}
+	else { /* Otherwise, give back leader, use the rest */
+	  set_inuse(m, newp, newsize);
+	  set_inuse(m, p, leadsize);
+	  leader = chunk2mem(p);
+	}
+	p = newp;
+      }
+
+      /* Give back spare room at the end */
+      if (!is_mmapped(p)) {
+	size_t size = chunksize(p);
+	if (size > nb + MIN_CHUNK_SIZE) {
+	  size_t remainder_size = size - nb;
+	  mchunkptr remainder = chunk_plus_offset(p, nb);
+	  set_inuse(m, p, nb);
+	  set_inuse(m, remainder, remainder_size);
+	  trailer = chunk2mem(remainder);
+	}
+      }
+
+      assert (chunksize(p) >= nb);
+      assert((((size_t)(chunk2mem(p))) % alignment) == 0);
+      check_inuse_chunk(m, p);
+      POSTACTION(m);
+      if (leader != 0) {
+	internal_free(m, leader);
+      }
+      if (trailer != 0) {
+	internal_free(m, trailer);
+      }
+      return chunk2mem(p);
+    }
+  }
+  return 0;
+}
+
+/* ------------------------ comalloc/coalloc support --------------------- */
+
+static void** ialloc(mstate m,
+		     size_t n_elements,
+		     size_t* sizes,
+		     int opts,
+		     void* chunks[]) {
+  /*
+    This provides common support for independent_X routines, handling
+    all of the combinations that can result.
+
+    The opts arg has:
+    bit 0 set if all elements are same size (using sizes[0])
+    bit 1 set if elements should be zeroed
+
+    m          - malloc state to allocate from
+    n_elements - number of elements to carve out
+    sizes      - request size of each element (only sizes[0] is read when
+                 bit 0 of opts is set)
+    chunks     - caller-supplied array that receives the element pointers,
+                 or 0 to have the pointer array carved from the end of the
+                 same aggregate chunk
+
+    All elements are carved from one aggregate chunk obtained with direct
+    mmapping disabled; each element is marked as an in-use chunk of its
+    own.
+
+    Returns the pointer array (chunks itself when it was supplied), or 0
+    if the aggregate allocation fails or PREACTION reports failure.  With
+    n_elements == 0 it returns chunks unchanged, or the result of
+    internal_malloc(m, 0) when chunks is 0.
+
+    NOTE(review): n_elements * sizeof(void*), n_elements * element_size
+    and the running sum of request2size(sizes[i]) are computed with no
+    visible overflow check -- confirm callers bound these values.
+    NOTE(review): when PREACTION fails after the aggregate chunk has been
+    obtained, 0 is returned without freeing that chunk.
+  */
+
+  size_t    element_size;   /* chunksize of each element, if all same */
+  size_t    contents_size;  /* total size of elements */
+  size_t    array_size;     /* request size of pointer array */
+  void*     mem;            /* malloced aggregate space */
+  mchunkptr p;              /* corresponding chunk */
+  size_t    remainder_size; /* remaining bytes while splitting */
+  void**    marray;         /* either "chunks" or malloced ptr array */
+  mchunkptr array_chunk;    /* chunk for malloced ptr array */
+  flag_t    was_enabled;    /* to disable mmap */
+  size_t    size;
+  size_t    i;
+
+  ensure_initialization();
+  /* compute array length, if needed */
+  if (chunks != 0) {
+    if (n_elements == 0)
+      return chunks; /* nothing to do */
+    marray = chunks;
+    array_size = 0;
+  }
+  else {
+    /* if empty req, must still return chunk representing empty array */
+    if (n_elements == 0)
+      return (void**)internal_malloc(m, 0);
+    marray = 0;
+    array_size = request2size(n_elements * (sizeof(void*)));
+  }
+
+  /* compute total element size */
+  if (opts & 0x1) { /* all-same-size */
+    element_size = request2size(*sizes);
+    contents_size = n_elements * element_size;
+  }
+  else { /* add up all the sizes */
+    element_size = 0;
+    contents_size = 0;
+    for (i = 0; i != n_elements; ++i)
+      contents_size += request2size(sizes[i]);
+  }
+
+  size = contents_size + array_size;
+
+  /*
+     Allocate the aggregate chunk.  First disable direct-mmapping so
+     malloc won't use it, since we would not be able to later
+     free/realloc space internal to a segregated mmap region.
+  */
+  was_enabled = use_mmap(m);
+  disable_mmap(m);
+  mem = internal_malloc(m, size - CHUNK_OVERHEAD);
+  if (was_enabled)
+    enable_mmap(m);
+  if (mem == 0)
+    return 0;
+
+  if (PREACTION(m)) return 0;
+  p = mem2chunk(mem);
+  remainder_size = chunksize(p);
+
+  assert(!is_mmapped(p));
+
+  if (opts & 0x2) {       /* optionally clear the elements */
+    memset((size_t*)mem, 0, remainder_size - SIZE_T_SIZE - array_size);
+  }
+
+  /* If not provided, allocate the pointer array as final part of chunk */
+  if (marray == 0) {
+    size_t  array_chunk_size;
+    array_chunk = chunk_plus_offset(p, contents_size);
+    array_chunk_size = remainder_size - contents_size;
+    marray = (void**) (chunk2mem(array_chunk));
+    set_size_and_pinuse_of_inuse_chunk(m, array_chunk, array_chunk_size);
+    remainder_size = contents_size;
+  }
+
+  /* split out elements */
+  for (i = 0; ; ++i) {
+    marray[i] = chunk2mem(p);
+    if (i != n_elements-1) {
+      if (element_size != 0)
+	size = element_size;
+      else
+	size = request2size(sizes[i]);
+      remainder_size -= size;
+      set_size_and_pinuse_of_inuse_chunk(m, p, size);
+      p = chunk_plus_offset(p, size);
+    }
+    else { /* the final element absorbs any overallocation slop */
+      set_size_and_pinuse_of_inuse_chunk(m, p, remainder_size);
+      break;
+    }
+  }
+
+#if DEBUG
+  if (marray != chunks) {
+    /* final element must have exactly exhausted chunk */
+    if (element_size != 0) {
+      assert(remainder_size == element_size);
+    }
+    else {
+      assert(remainder_size == request2size(sizes[i]));
+    }
+    check_inuse_chunk(m, mem2chunk(marray));
+  }
+  for (i = 0; i != n_elements; ++i)
+    check_inuse_chunk(m, mem2chunk(marray[i]));
+
+#endif /* DEBUG */
+
+  POSTACTION(m);
+  return marray;
+}
+
+
+/* -------------------------- public routines ---------------------------- */
+
+#if !ONLY_MSPACES
+
+void* dlmalloc(size_t bytes) {
+  /*
+     Basic algorithm:
+     If a small request (< 256 bytes minus per-chunk overhead):
+       1. If one exists, use a remainderless chunk in associated smallbin.
+	  (Remainderless means that there are too few excess bytes to
+	  represent as a chunk.)
+       2. If it is big enough, use the dv chunk, which is normally the
+	  chunk adjacent to the one used for the most recent small request.
+       3. If one exists, split the smallest available chunk in a bin,
+	  saving remainder in dv.
+       4. If it is big enough, use the top chunk.
+       5. If available, get memory from system and use it
+     Otherwise, for a large request:
+       1. Find the smallest available binned chunk that fits, and use it
+	  if it is better fitting than dv chunk, splitting if necessary.
+       2. If better fitting than any binned chunk, use the dv chunk.
+       3. If it is big enough, use the top chunk.
+       4. If request size >= mmap threshold, try to directly mmap this chunk.
+       5. If available, get memory from system and use it
+
+     The ugly goto's here ensure that postaction occurs along all paths.
+
+     Allocation is made from the global state gm.  Returns the user
+     pointer (chunk2mem) of the chosen chunk, whatever sys_alloc returns
+     when the request falls through to it, or 0 when PREACTION fails.
+  */
+
+#if USE_LOCKS
+  ensure_initialization(); /* initialize in sys_alloc if not using locks */
+#endif
+
+  if (!PREACTION(gm)) {
+    void* mem;
+    size_t nb; /* padded request size */
+    if (bytes <= MAX_SMALL_REQUEST) {
+      bindex_t idx;
+      binmap_t smallbits;
+      nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
+      idx = small_index(nb);
+      smallbits = gm->smallmap >> idx; /* bit 0 now stands for bin idx */
+
+      if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
+	mchunkptr b, p;
+	idx += ~smallbits & 1;       /* Uses next bin if idx empty */
+	b = smallbin_at(gm, idx);
+	p = b->fd;
+	assert(chunksize(p) == small_index2size(idx));
+	unlink_first_small_chunk(gm, b, p, idx);
+	set_inuse_and_pinuse(gm, p, small_index2size(idx));
+	mem = chunk2mem(p);
+	check_malloced_chunk(gm, mem, nb);
+	goto postaction;
+      }
+
+      else if (nb > gm->dvsize) {
+	if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
+	  mchunkptr b, p, r;
+	  size_t rsize;
+	  bindex_t i;
+	  binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
+	  binmap_t leastbit = least_bit(leftbits);
+	  compute_bit2idx(leastbit, i);
+	  b = smallbin_at(gm, i);
+	  p = b->fd;
+	  assert(chunksize(p) == small_index2size(i));
+	  unlink_first_small_chunk(gm, b, p, i);
+	  rsize = small_index2size(i) - nb;
+	  /* Fit here cannot be remainderless if 4byte sizes */
+	  if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
+	    set_inuse_and_pinuse(gm, p, small_index2size(i));
+	  else { /* split: remainder r becomes the new dv chunk */
+	    set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
+	    r = chunk_plus_offset(p, nb);
+	    set_size_and_pinuse_of_free_chunk(r, rsize);
+	    replace_dv(gm, r, rsize);
+	  }
+	  mem = chunk2mem(p);
+	  check_malloced_chunk(gm, mem, nb);
+	  goto postaction;
+	}
+
+	else if (gm->treemap != 0 && (mem = tmalloc_small(gm, nb)) != 0) {
+	  check_malloced_chunk(gm, mem, nb);
+	  goto postaction;
+	}
+      }
+    }
+    else if (bytes >= MAX_REQUEST)
+      nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
+    else {
+      nb = pad_request(bytes);
+      if (gm->treemap != 0 && (mem = tmalloc_large(gm, nb)) != 0) {
+	check_malloced_chunk(gm, mem, nb);
+	goto postaction;
+      }
+    }
+
+    if (nb <= gm->dvsize) { /* serve the request from the dv chunk */
+      size_t rsize = gm->dvsize - nb;
+      mchunkptr p = gm->dv;
+      if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
+	mchunkptr r = gm->dv = chunk_plus_offset(p, nb);
+	gm->dvsize = rsize;
+	set_size_and_pinuse_of_free_chunk(r, rsize);
+	set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
+      }
+      else { /* exhaust dv */
+	size_t dvs = gm->dvsize;
+	gm->dvsize = 0;
+	gm->dv = 0;
+	set_inuse_and_pinuse(gm, p, dvs);
+      }
+      mem = chunk2mem(p);
+      check_malloced_chunk(gm, mem, nb);
+      goto postaction;
+    }
+
+    else if (nb < gm->topsize) { /* Split top */
+      size_t rsize = gm->topsize -= nb;
+      mchunkptr p = gm->top;
+      mchunkptr r = gm->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(gm, p, nb);
+      mem = chunk2mem(p);
+      check_top_chunk(gm, gm->top);
+      check_malloced_chunk(gm, mem, nb);
+      goto postaction;
+    }
+
+    mem = sys_alloc(gm, nb); /* last resort: ask the system */
+
+  postaction:
+    POSTACTION(gm);
+    return mem;
+  }
+
+  return 0;
+}
+
+void dlfree(void* mem) {
+  /*
+     Consolidate freed chunks with preceding or succeeding bordering
+     free chunks, if they exist, and then place in a bin.  Intermixed
+     with special cases for top, dv, mmapped chunks, and usage errors.
+
+     A null mem is ignored.  With FOOTERS the owning state is looked up
+     from the chunk and checked with ok_magic (USAGE_ERROR_ACTION and
+     return on mismatch); otherwise fm is simply an alias for gm.
+
+     A chunk whose previous-in-use bit is clear and whose prev_foot
+     carries IS_MMAPPED_BIT is handed back with CALL_MUNMAP.  Any other
+     chunk is merged backward and forward as far as possible; merging
+     into top may trigger sys_trim, and every time the release_checks
+     countdown reaches 0 on a large free, release_unused_segments runs.
+     A chunk that fails the runtime checks ends in USAGE_ERROR_ACTION.
+  */
+
+  if (mem != 0) {
+    mchunkptr p  = mem2chunk(mem);
+#if FOOTERS
+    mstate fm = get_mstate_for(p);
+    if (!ok_magic(fm)) {
+      USAGE_ERROR_ACTION(fm, p);
+      return;
+    }
+#else /* FOOTERS */
+#define fm gm
+#endif /* FOOTERS */
+    if (!PREACTION(fm)) {
+      check_inuse_chunk(fm, p);
+      if (RTCHECK(ok_address(fm, p) && ok_cinuse(p))) {
+	size_t psize = chunksize(p);
+	mchunkptr next = chunk_plus_offset(p, psize);
+	if (!pinuse(p)) {
+	  size_t prevsize = p->prev_foot;
+	  if ((prevsize & IS_MMAPPED_BIT) != 0) { /* directly mmapped chunk */
+	    prevsize &= ~IS_MMAPPED_BIT;
+	    psize += prevsize + MMAP_FOOT_PAD;
+	    if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
+	      fm->footprint -= psize;
+	    goto postaction;
+	  }
+	  else {
+	    mchunkptr prev = chunk_minus_offset(p, prevsize);
+	    psize += prevsize;
+	    p = prev;
+	    if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
+	      if (p != fm->dv) {
+		unlink_chunk(fm, p, prevsize);
+	      }
+	      else if ((next->head & INUSE_BITS) == INUSE_BITS) {
+		fm->dvsize = psize; /* dv simply grows; nothing to merge forward */
+		set_free_with_pinuse(p, psize, next);
+		goto postaction;
+	      }
+	    }
+	    else
+	      goto erroraction;
+	  }
+	}
+
+	if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
+	  if (!cinuse(next)) {  /* consolidate forward */
+	    if (next == fm->top) { /* merged chunk becomes the new top */
+	      size_t tsize = fm->topsize += psize;
+	      fm->top = p;
+	      p->head = tsize | PINUSE_BIT;
+	      if (p == fm->dv) {
+		fm->dv = 0;
+		fm->dvsize = 0;
+	      }
+	      if (should_trim(fm, tsize))
+		sys_trim(fm, 0);
+	      goto postaction;
+	    }
+	    else if (next == fm->dv) { /* merged chunk becomes the new dv */
+	      size_t dsize = fm->dvsize += psize;
+	      fm->dv = p;
+	      set_size_and_pinuse_of_free_chunk(p, dsize);
+	      goto postaction;
+	    }
+	    else { /* ordinary free neighbour: unlink it and absorb it */
+	      size_t nsize = chunksize(next);
+	      psize += nsize;
+	      unlink_chunk(fm, next, nsize);
+	      set_size_and_pinuse_of_free_chunk(p, psize);
+	      if (p == fm->dv) {
+		fm->dvsize = psize;
+		goto postaction;
+	      }
+	    }
+	  }
+	  else
+	    set_free_with_pinuse(p, psize, next);
+
+	  if (is_small(psize)) {
+	    insert_small_chunk(fm, p, psize);
+	    check_free_chunk(fm, p);
+	  }
+	  else {
+	    tchunkptr tp = (tchunkptr)p;
+	    insert_large_chunk(fm, tp, psize);
+	    check_free_chunk(fm, p);
+	    if (--fm->release_checks == 0)
+	      release_unused_segments(fm);
+	  }
+	  goto postaction;
+	}
+      }
+    erroraction:
+      USAGE_ERROR_ACTION(fm, p);
+    postaction:
+      POSTACTION(fm);
+    }
+  }
+#if !FOOTERS
+#undef fm
+#endif /* FOOTERS */
+}
+
+/*
+  Allocate room for n_elements objects of elem_size bytes each through
+  dlmalloc and zero it when calloc_must_clear says the chunk needs it.
+  A product that overflows is replaced by MAX_SIZE_T so that the
+  allocation fails downstream.  Returns the block, or 0 on failure.
+*/
+void* dlcalloc(size_t n_elements, size_t elem_size) {
+  size_t total = 0;
+  void* block;
+
+  if (n_elements != 0) {
+    /* the division test is skipped when both operands fit in 16 bits */
+    size_t high_bits = (n_elements | elem_size) & ~(size_t)0xffff;
+    total = n_elements * elem_size;
+    if (high_bits && (total / n_elements != elem_size))
+      total = MAX_SIZE_T; /* force downstream failure on overflow */
+  }
+
+  block = dlmalloc(total);
+  if (block == 0)
+    return block;
+  if (calloc_must_clear(mem2chunk(block)))
+    memset(block, 0, total);
+  return block;
+}
+/*
+  Resize the block at oldmem to bytes bytes.
+
+  A null oldmem makes this equivalent to dlmalloc(bytes).  When
+  REALLOC_ZERO_BYTES_FREES is defined, a zero bytes frees oldmem and
+  returns 0.  Otherwise the work is done by internal_realloc on gm or,
+  with FOOTERS, on the state recorded for the chunk, after that state has
+  passed ok_magic (USAGE_ERROR_ACTION and a 0 return if it has not).
+*/
+void* dlrealloc(void* oldmem, size_t bytes) {
+  if (oldmem == 0)
+    return dlmalloc(bytes);
+#ifdef REALLOC_ZERO_BYTES_FREES
+  if (bytes == 0) {
+    dlfree(oldmem);
+    return 0;
+  }
+#endif /* REALLOC_ZERO_BYTES_FREES */
+  else { /* pairs with whichever `if` precedes it after preprocessing */
+#if ! FOOTERS
+    mstate m = gm;
+#else /* FOOTERS */
+    mstate m = get_mstate_for(mem2chunk(oldmem));
+    if (!ok_magic(m)) {
+      USAGE_ERROR_ACTION(m, oldmem);
+      return 0;
+    }
+#endif /* FOOTERS */
+    return internal_realloc(m, oldmem, bytes);
+  }
+}
+
+void* dlmemalign(size_t alignment, size_t bytes) { /* aligned allocation: forwards to internal_memalign on the global state gm */
+  return internal_memalign(gm, alignment, bytes);
+}
+
+void** dlindependent_calloc(size_t n_elements, size_t elem_size,
+				 void* chunks[]) { /* forwards to ialloc on the global state gm */
+  size_t sz = elem_size; /* serves as 1-element array */
+  return ialloc(gm, n_elements, &sz, 3, chunks); /* NOTE(review): opts value 3 presumably means "all elements same size" + "zero them" - confirm against ialloc */
+}
+
+void** dlindependent_comalloc(size_t n_elements, size_t sizes[],
+				   void* chunks[]) { /* forwards to ialloc on the global state gm with the caller's sizes array */
+  return ialloc(gm, n_elements, sizes, 0, chunks); /* opts is 0 here, unlike the 3 used by dlindependent_calloc */
+}
+
+void* dlvalloc(size_t bytes) {
+  /* Allocate `bytes` bytes through dlmemalign, using the page size as the alignment. */
+  ensure_initialization();
+  /* mparams.page_size is read only after the initialization call above. */
+  return dlmemalign(mparams.page_size, bytes);
+}
+
+void* dlpvalloc(size_t bytes) { /* page-aligned allocation whose size is rounded up to a whole number of pages */
+  size_t pagesz;
+  ensure_initialization();
+  pagesz = mparams.page_size; /* read after initialization */
+  /* If rounding up would wrap size_t, pass MAX_SIZE_T so the request fails instead of silently becoming a tiny allocation. */
+  return dlmemalign(pagesz, (bytes > MAX_SIZE_T - pagesz) ? MAX_SIZE_T : ((bytes + pagesz - SIZE_T_ONE) & ~(pagesz - SIZE_T_ONE)));
+
+int dlmalloc_trim(size_t pad) { /* run sys_trim on the global state; returns its result, or 0 if PREACTION fails */
+  int result = 0; /* declared before any statement so the function also compiles as C89 */
+  ensure_initialization();
+  if (!PREACTION(gm)) {
+    result = sys_trim(gm, pad);
+    POSTACTION(gm);
+  }
+  return result;
+}
+
+size_t dlmalloc_footprint(void) { /* reads the footprint field of the global state; no locking is done here */
+  return gm->footprint;
+}
+
+size_t dlmalloc_max_footprint(void) { /* reads the max_footprint field of the global state; no locking is done here */
+  return gm->max_footprint;
+}
+
+#if !NO_MALLINFO
+struct mallinfo dlmallinfo(void) { /* mallinfo for the global state, as computed by internal_mallinfo */
+  return internal_mallinfo(gm);
+}
+#endif /* !NO_MALLINFO */
+
+void dlmalloc_stats(void) { /* statistics for the global state via internal_malloc_stats; (void) gives a real C prototype like the sibling functions */
+  internal_malloc_stats(gm);
+}
+
+int dlmallopt(int param_number, int value) { /* forwards to change_mparam and returns its result unchanged */
+  return change_mparam(param_number, value);
+}
+
+#endif /* !ONLY_MSPACES */
+
+size_t dlmalloc_usable_size(void* mem) {
+  /* Chunk size minus overhead for an in-use chunk; 0 for a null mem or a chunk not marked in use. */
+  mchunkptr chunk;
+  if (mem == 0)
+    return 0;
+  chunk = mem2chunk(mem);
+  return cinuse(chunk) ? chunksize(chunk) - overhead_for(chunk) : 0;
+}
+
+/* ----------------------------- user mspaces ---------------------------- */
+
+#if MSPACES
+
+static mstate init_user_mstate(char* tbase, size_t tsize) { /* set up a malloc_state inside the region [tbase, tbase+tsize) and return it */
+  size_t msize = pad_request(sizeof(struct malloc_state));
+  mchunkptr mn;
+  mchunkptr msp = align_as_chunk(tbase);
+  mstate m = (mstate)(chunk2mem(msp)); /* the state is stored as the payload of the first chunk in the region */
+  memset(m, 0, msize);
+  (void)INITIAL_LOCK(&m->mutex);
+  msp->head = (msize|PINUSE_BIT|CINUSE_BIT); /* the chunk holding the state is marked in use */
+  m->seg.base = m->least_addr = tbase;
+  m->seg.size = m->footprint = m->max_footprint = tsize; /* the whole region counts as footprint from the start */
+  m->magic = mparams.magic; /* this is the value ok_magic() checks in the mspace_* entry points */
+  m->release_checks = MAX_RELEASE_CHECK_RATE;
+  m->mflags = mparams.default_mflags;
+  m->extp = 0;
+  m->exts = 0;
+  disable_contiguous(m);
+  init_bins(m);
+  mn = next_chunk(mem2chunk(m)); /* first chunk after the one holding the state */
+  init_top(m, mn, (size_t)((tbase + tsize) - (char*)mn) - TOP_FOOT_SIZE); /* everything from mn to the region end, less TOP_FOOT_SIZE, becomes top */
+  check_top_chunk(m, m->top);
+  return m;
+}
+
+mspace create_mspace(size_t capacity, int locked) { /* map a fresh region with CALL_MMAP and build an mspace in it; returns 0 if capacity is too large or the mapping fails */
+  mstate m = 0;
+  size_t msize;
+  ensure_initialization();
+  msize = pad_request(sizeof(struct malloc_state));
+  if (capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) { /* i.e. capacity plus this overhead does not wrap size_t */
+    size_t rs = ((capacity == 0)? mparams.granularity :
+		 (capacity + TOP_FOOT_SIZE + msize)); /* capacity 0 selects one granularity unit */
+    size_t tsize = granularity_align(rs);
+    char* tbase = (char*)(CALL_MMAP(tsize));
+    if (tbase != CMFAIL) {
+      m = init_user_mstate(tbase, tsize);
+      m->seg.sflags = IS_MMAPPED_BIT; /* destroy_mspace unmaps segments carrying this bit (and not EXTERN_BIT) */
+      set_lock(m, locked);
+    }
+  }
+  return (mspace)m;
+}
+
+mspace create_mspace_with_base(void* base, size_t capacity, int locked) { /* build an mspace inside caller-supplied memory; returns 0 if capacity is out of range */
+  mstate m = 0;
+  size_t msize;
+  ensure_initialization();
+  msize = pad_request(sizeof(struct malloc_state));
+  if (capacity > msize + TOP_FOOT_SIZE && /* must exceed the space taken by the state plus TOP_FOOT_SIZE */
+      capacity < (size_t) -(msize + TOP_FOOT_SIZE + mparams.page_size)) { /* same upper bound as create_mspace */
+    m = init_user_mstate((char*)base, capacity);
+    m->seg.sflags = EXTERN_BIT; /* destroy_mspace does not unmap segments carrying EXTERN_BIT */
+    set_lock(m, locked);
+  }
+  return (mspace)m;
+}
+
+int mspace_mmap_large_chunks(mspace msp, int enable) {
+  /* Turn the mspace's mmap flag on or off; returns 1 if it was set beforehand, otherwise 0. */
+  mstate state = (mstate)msp;
+  int was_enabled = 0;
+  if (PREACTION(state))
+    return was_enabled; /* PREACTION failed: flags are left untouched */
+  was_enabled = use_mmap(state) ? 1 : 0;
+  if (enable)
+    enable_mmap(state);
+  else
+    disable_mmap(state);
+  POSTACTION(state);
+  return was_enabled;
+}
+
+size_t destroy_mspace(mspace msp) {
+  /* Unmap every mmapped, non-extern segment of msp; returns the number of bytes successfully unmapped. */
+  mstate ms = (mstate)msp;
+  size_t released = 0;
+  msegmentptr seg;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return released;
+  }
+  seg = &ms->seg;
+  while (seg != 0) {
+    char* start = seg->base; /* fields are copied and the link followed before any unmapping happens */
+    size_t length = seg->size;
+    flag_t flags = seg->sflags;
+    seg = seg->next;
+    if ((flags & IS_MMAPPED_BIT) && !(flags & EXTERN_BIT) && CALL_MUNMAP(start, length) == 0)
+      released += length;
+  }
+  return released;
+}
+
+/*
+  mspace versions of routines are near-clones of the global
+  versions. This is not so nice but better than the alternatives.
+*/
+
+
+void* mspace_malloc(mspace msp, size_t bytes) { /* allocate a chunk for a request of `bytes` bytes from msp; returns 0 on a bad msp, a failed PREACTION, or when sys_alloc returns 0 */
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  if (!PREACTION(ms)) {
+    void* mem;
+    size_t nb; /* padded size of the chunk being searched for */
+    if (bytes <= MAX_SMALL_REQUEST) { /* small request: look in the small bins first */
+      bindex_t idx;
+      binmap_t smallbits;
+      nb = (bytes < MIN_REQUEST)? MIN_CHUNK_SIZE : pad_request(bytes);
+      idx = small_index(nb);
+      smallbits = ms->smallmap >> idx; /* bit 0 now corresponds to bin idx, bit 1 to bin idx+1 */
+
+      if ((smallbits & 0x3U) != 0) { /* Remainderless fit to a smallbin. */
+	mchunkptr b, p;
+	idx += ~smallbits & 1;       /* Uses next bin if idx empty */
+	b = smallbin_at(ms, idx);
+	p = b->fd;
+	assert(chunksize(p) == small_index2size(idx));
+	unlink_first_small_chunk(ms, b, p, idx);
+	set_inuse_and_pinuse(ms, p, small_index2size(idx));
+	mem = chunk2mem(p);
+	check_malloced_chunk(ms, mem, nb);
+	goto postaction;
+      }
+
+      else if (nb > ms->dvsize) { /* the dv chunk is too small for this request */
+	if (smallbits != 0) { /* Use chunk in next nonempty smallbin */
+	  mchunkptr b, p, r;
+	  size_t rsize;
+	  bindex_t i;
+	  binmap_t leftbits = (smallbits << idx) & left_bits(idx2bit(idx));
+	  binmap_t leastbit = least_bit(leftbits);
+	  compute_bit2idx(leastbit, i);
+	  b = smallbin_at(ms, i);
+	  p = b->fd;
+	  assert(chunksize(p) == small_index2size(i));
+	  unlink_first_small_chunk(ms, b, p, i);
+	  rsize = small_index2size(i) - nb; /* bytes left over after taking nb from the chunk */
+	  /* Fit here cannot be remainderless if 4byte sizes */
+	  if (SIZE_T_SIZE != 4 && rsize < MIN_CHUNK_SIZE)
+	    set_inuse_and_pinuse(ms, p, small_index2size(i)); /* remainder too small to be a chunk: hand out the whole thing */
+	  else {
+	    set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+	    r = chunk_plus_offset(p, nb);
+	    set_size_and_pinuse_of_free_chunk(r, rsize);
+	    replace_dv(ms, r, rsize); /* the split-off remainder becomes the new dv */
+	  }
+	  mem = chunk2mem(p);
+	  check_malloced_chunk(ms, mem, nb);
+	  goto postaction;
+	}
+
+	else if (ms->treemap != 0 && (mem = tmalloc_small(ms, nb)) != 0) { /* no small bin has a chunk: try the tree bins */
+	  check_malloced_chunk(ms, mem, nb);
+	  goto postaction;
+	}
+      }
+    }
+    else if (bytes >= MAX_REQUEST)
+      nb = MAX_SIZE_T; /* Too big to allocate. Force failure (in sys alloc) */
+    else {
+      nb = pad_request(bytes);
+      if (ms->treemap != 0 && (mem = tmalloc_large(ms, nb)) != 0) {
+	check_malloced_chunk(ms, mem, nb);
+	goto postaction;
+      }
+    }
+
+    if (nb <= ms->dvsize) { /* reached when no bin supplied a chunk: carve the request out of dv */
+      size_t rsize = ms->dvsize - nb;
+      mchunkptr p = ms->dv;
+      if (rsize >= MIN_CHUNK_SIZE) { /* split dv */
+	mchunkptr r = ms->dv = chunk_plus_offset(p, nb);
+	ms->dvsize = rsize;
+	set_size_and_pinuse_of_free_chunk(r, rsize);
+	set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+      }
+      else { /* exhaust dv */
+	size_t dvs = ms->dvsize;
+	ms->dvsize = 0;
+	ms->dv = 0;
+	set_inuse_and_pinuse(ms, p, dvs);
+      }
+      mem = chunk2mem(p);
+      check_malloced_chunk(ms, mem, nb);
+      goto postaction;
+    }
+
+    else if (nb < ms->topsize) { /* Split top */
+      size_t rsize = ms->topsize -= nb;
+      mchunkptr p = ms->top;
+      mchunkptr r = ms->top = chunk_plus_offset(p, nb);
+      r->head = rsize | PINUSE_BIT;
+      set_size_and_pinuse_of_inuse_chunk(ms, p, nb);
+      mem = chunk2mem(p);
+      check_top_chunk(ms, ms->top);
+      check_malloced_chunk(ms, mem, nb);
+      goto postaction;
+    }
+
+    mem = sys_alloc(ms, nb); /* neither the bins, dv nor top could serve the request */
+
+  postaction:
+    POSTACTION(ms);
+    return mem;
+  }
+
+  return 0; /* PREACTION failed */
+}
+
+void mspace_free(mspace msp, void* mem) { /* release mem, merging it with free neighbouring chunks; a null mem is a no-op */
+  if (mem != 0) {
+    mchunkptr p  = mem2chunk(mem);
+#if FOOTERS
+    mstate fm = get_mstate_for(p); /* with FOOTERS the owning state is derived from the chunk and msp is not used */
+#else /* FOOTERS */
+    mstate fm = (mstate)msp;
+#endif /* FOOTERS */
+    if (!ok_magic(fm)) {
+      USAGE_ERROR_ACTION(fm, p);
+      return;
+    }
+    if (!PREACTION(fm)) {
+      check_inuse_chunk(fm, p);
+      if (RTCHECK(ok_address(fm, p) && ok_cinuse(p))) {
+	size_t psize = chunksize(p);
+	mchunkptr next = chunk_plus_offset(p, psize);
+	if (!pinuse(p)) { /* previous-in-use bit clear: either an mmapped chunk or a free predecessor */
+	  size_t prevsize = p->prev_foot;
+	  if ((prevsize & IS_MMAPPED_BIT) != 0) { /* mmapped chunk: unmap the region instead of binning it */
+	    prevsize &= ~IS_MMAPPED_BIT;
+	    psize += prevsize + MMAP_FOOT_PAD;
+	    if (CALL_MUNMAP((char*)p - prevsize, psize) == 0)
+	      fm->footprint -= psize;
+	    goto postaction;
+	  }
+	  else {
+	    mchunkptr prev = chunk_minus_offset(p, prevsize);
+	    psize += prevsize;
+	    p = prev; /* from here on p denotes the merged chunk starting at prev */
+	    if (RTCHECK(ok_address(fm, prev))) { /* consolidate backward */
+	      if (p != fm->dv) {
+		unlink_chunk(fm, p, prevsize);
+	      }
+	      else if ((next->head & INUSE_BITS) == INUSE_BITS) { /* prev is dv and next is in use: just grow dv */
+		fm->dvsize = psize;
+		set_free_with_pinuse(p, psize, next);
+		goto postaction;
+	      }
+	    }
+	    else
+	      goto erroraction;
+	  }
+	}
+
+	if (RTCHECK(ok_next(p, next) && ok_pinuse(next))) {
+	  if (!cinuse(next)) {  /* consolidate forward */
+	    if (next == fm->top) { /* merge into top */
+	      size_t tsize = fm->topsize += psize;
+	      fm->top = p;
+	      p->head = tsize | PINUSE_BIT;
+	      if (p == fm->dv) { /* dv was absorbed into top, so clear it */
+		fm->dv = 0;
+		fm->dvsize = 0;
+	      }
+	      if (should_trim(fm, tsize))
+		sys_trim(fm, 0);
+	      goto postaction;
+	    }
+	    else if (next == fm->dv) { /* merge into dv */
+	      size_t dsize = fm->dvsize += psize;
+	      fm->dv = p;
+	      set_size_and_pinuse_of_free_chunk(p, dsize);
+	      goto postaction;
+	    }
+	    else { /* next is an ordinary free chunk: unlink it and absorb it */
+	      size_t nsize = chunksize(next);
+	      psize += nsize;
+	      unlink_chunk(fm, next, nsize);
+	      set_size_and_pinuse_of_free_chunk(p, psize);
+	      if (p == fm->dv) {
+		fm->dvsize = psize;
+		goto postaction;
+	      }
+	    }
+	  }
+	  else
+	    set_free_with_pinuse(p, psize, next);
+
+	  if (is_small(psize)) { /* file the resulting free chunk in a small bin or the tree */
+	    insert_small_chunk(fm, p, psize);
+	    check_free_chunk(fm, p);
+	  }
+	  else {
+	    tchunkptr tp = (tchunkptr)p;
+	    insert_large_chunk(fm, tp, psize);
+	    check_free_chunk(fm, p);
+	    if (--fm->release_checks == 0) /* segment release runs only when this countdown reaches zero */
+	      release_unused_segments(fm);
+	  }
+	  goto postaction;
+	}
+      }
+    erroraction: /* reached by goto or by falling through a failed RTCHECK */
+      USAGE_ERROR_ACTION(fm, p);
+    postaction:
+      POSTACTION(fm);
+    }
+  }
+}
+
+void* mspace_calloc(mspace msp, size_t n_elements, size_t elem_size) {
+  /* calloc for an mspace: get n_elements * elem_size bytes via internal_malloc, zeroing when required. */
+  mstate ms = (mstate)msp;
+  size_t total = 0;
+  void* block;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  if (n_elements != 0) {
+    total = n_elements * elem_size;
+    if ((((n_elements | elem_size) & ~(size_t)0xffff) != 0) && (total / n_elements != elem_size))
+      total = MAX_SIZE_T; /* product wrapped: ask for an impossible size so the allocation fails */
+  }
+  block = internal_malloc(ms, total);
+  if (block != 0 && calloc_must_clear(mem2chunk(block)))
+    memset(block, 0, total);
+  return block;
+}
+
+void* mspace_realloc(mspace msp, void* oldmem, size_t bytes) {
+  /* realloc for an mspace; a null oldmem is handled as mspace_malloc(msp, bytes).
+     Returns 0 when the owning state fails the magic check. */
+  mstate ms;
+  if (oldmem == 0)
+    return mspace_malloc(msp, bytes);
+#ifdef REALLOC_ZERO_BYTES_FREES
+  if (bytes == 0) {
+    mspace_free(msp, oldmem);
+    return 0;
+  }
+#endif /* REALLOC_ZERO_BYTES_FREES */
+#if FOOTERS
+  ms = get_mstate_for(mem2chunk(oldmem));
+#else /* FOOTERS */
+  ms = (mstate)msp;
+#endif /* FOOTERS */
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  return internal_realloc(ms, oldmem, bytes);
+}
+
+void* mspace_memalign(mspace msp, size_t alignment, size_t bytes) {
+  /* Aligned allocation from msp via internal_memalign; 0 if msp fails the magic check. */
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms))
+    return internal_memalign(ms, alignment, bytes);
+  USAGE_ERROR_ACTION(ms,ms);
+  return 0;
+}
+
+void** mspace_independent_calloc(mspace msp, size_t n_elements,
+				 size_t elem_size, void* chunks[]) {
+  /* Forwards to ialloc with opts 3 and one shared element size; 0 if msp fails the magic check. */
+  size_t single_size = elem_size; /* ialloc takes a size array; this stands in for one of length 1 */
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms))
+    return ialloc(ms, n_elements, &single_size, 3, chunks);
+  USAGE_ERROR_ACTION(ms,ms);
+  return 0;
+}
+
+void** mspace_independent_comalloc(mspace msp, size_t n_elements,
+				   size_t sizes[], void* chunks[]) {
+  /* Forwards to ialloc with opts 0 and the caller's sizes array; 0 if msp fails the magic check. */
+  mstate ms = (mstate)msp;
+  if (ok_magic(ms))
+    return ialloc(ms, n_elements, sizes, 0, chunks);
+  USAGE_ERROR_ACTION(ms,ms);
+  return 0;
+}
+
+int mspace_trim(mspace msp, size_t pad) {
+  /* Run sys_trim on msp and return its result; 0 if msp is invalid or PREACTION fails. */
+  mstate ms = (mstate)msp;
+  int trimmed = 0;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return trimmed;
+  }
+  if (!PREACTION(ms)) {
+    trimmed = sys_trim(ms, pad);
+    POSTACTION(ms);
+  }
+  return trimmed;
+}
+
+void mspace_malloc_stats(mspace msp) {
+  /* Hand msp to internal_malloc_stats after checking its magic value. */
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return;
+  }
+  internal_malloc_stats(ms);
+}
+
+size_t mspace_footprint(mspace msp) {
+  /* Report the footprint counter stored in msp's state.
+     An msp that fails the magic check triggers USAGE_ERROR_ACTION
+     and yields 0. */
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  return ms->footprint;
+}
+
+
+size_t mspace_max_footprint(mspace msp) {
+  /* Report the max_footprint counter stored in msp's state.
+     An msp that fails the magic check triggers USAGE_ERROR_ACTION
+     and yields 0. */
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+    return 0;
+  }
+  return ms->max_footprint;
+}
+
+
+#if !NO_MALLINFO
+struct mallinfo mspace_mallinfo(mspace msp) { /* mallinfo for mspace msp, as computed by internal_mallinfo */
+  mstate ms = (mstate)msp;
+  if (!ok_magic(ms)) {
+    USAGE_ERROR_ACTION(ms,ms);
+  } /* NOTE(review): no early return here - if USAGE_ERROR_ACTION returns, internal_mallinfo still runs on the state that failed the check */
+  return internal_mallinfo(ms);
+}
+#endif /* !NO_MALLINFO */
+
+size_t mspace_usable_size(void* mem) {
+  /* Chunk size minus overhead for an in-use chunk; 0 for a null mem or a chunk not marked in use. */
+  mchunkptr chunk;
+  if (mem == 0)
+    return 0;
+  chunk = mem2chunk(mem);
+  return cinuse(chunk) ? chunksize(chunk) - overhead_for(chunk) : 0;
+}
+
+int mspace_mallopt(int param_number, int value) { /* forwards to change_mparam; takes no mspace argument, same call as dlmallopt */
+  return change_mparam(param_number, value);
+}
+
+#endif /* MSPACES */
+
+/* -------------------- Alternative MORECORE functions ------------------- */
+
+/*
+  Guidelines for creating a custom version of MORECORE:
+
+  * For best performance, MORECORE should allocate in multiples of pagesize.
+  * MORECORE may allocate more memory than requested. (Or even less,
+      but this will usually result in a malloc failure.)
+  * MORECORE must not allocate memory when given argument zero, but
+      instead return one past the end address of memory from previous
+      nonzero call.
+  * For best performance, consecutive calls to MORECORE with positive
+      arguments should return increasing addresses, indicating that
+      space has been contiguously extended.
+  * Even though consecutive calls to MORECORE need not return contiguous
+      addresses, it must be OK for malloc'ed chunks to span multiple
+      regions in those cases where they do happen to be contiguous.
+  * MORECORE need not handle negative arguments -- it may instead
+      just return MFAIL when given negative arguments.
+      Negative arguments are always multiples of pagesize. MORECORE
+      must not misinterpret negative args as large positive unsigned
+      args. You can suppress all such calls from even occurring by defining
+      MORECORE_CANNOT_TRIM.
+
+  As an example alternative MORECORE, here is a custom allocator
+  kindly contributed for pre-OSX macOS.  It uses virtually but not
+  necessarily physically contiguous non-paged memory (locked in,
+  present and won't get swapped out).  You can use it by uncommenting
+  this section, adding some #includes, and setting up the appropriate
+  defines above:
+
+      #define MORECORE osMoreCore
+
+  There is also a shutdown routine that should somehow be called for
+  cleanup upon program exit.
+
+  #define MAX_POOL_ENTRIES 100
+  #define MINIMUM_MORECORE_SIZE  (64 * 1024U)
+  static int next_os_pool;
+  void *our_os_pools[MAX_POOL_ENTRIES];
+
+  void *osMoreCore(int size)
+  {
+    void *ptr = 0;
+    static void *sbrk_top = 0;
+
+    if (size > 0)
+    {
+      if (size < MINIMUM_MORECORE_SIZE)
+	 size = MINIMUM_MORECORE_SIZE;
+      if (CurrentExecutionLevel() == kTaskLevel)
+	 ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0);
+      if (ptr == 0)
+      {
+	return (void *) MFAIL;
+      }
+      // save ptrs so they can be freed during cleanup
+      our_os_pools[next_os_pool] = ptr;
+      next_os_pool++;
+      ptr = (void *) ((((size_t) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK);
+      sbrk_top = (char *) ptr + size;
+      return ptr;
+    }
+    else if (size < 0)
+    {
+      // we don't currently support shrink behavior
+      return (void *) MFAIL;
+    }
+    else
+    {
+      return sbrk_top;
+    }
+  }
+
+  // cleanup any allocated memory pools
+  // called as last thing before shutting down driver
+
+  void osCleanupMem(void)
+  {
+    void **ptr;
+
+    for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++)
+      if (*ptr)
+      {
+	 PoolDeallocate(*ptr);
+	 *ptr = 0;
+      }
+  }
+
+*/
+
+
+/* -----------------------------------------------------------------------
+History:
+    V2.8.4 (not yet released)
+      * Add mspace_mmap_large_chunks; thanks to Jean Brouwers
+      * Fix insufficient sys_alloc padding when using 16byte alignment
+      * Fix bad error check in mspace_footprint
+      * Adaptations for ptmalloc, courtesy of Wolfram Gloger.
+      * Reentrant spin locks, courtesy of Earl Chew and others
+      * Win32 improvements, courtesy of Niall Douglas and Earl Chew
+      * Add NO_SEGMENT_TRAVERSAL and MAX_RELEASE_CHECK_RATE options
+      * Extension hook in malloc_state
+      * Various small adjustments to reduce warnings on some compilers
+      * Various configuration extensions/changes for more platforms. Thanks
+	 to all who contributed these.
+
+    V2.8.3 Thu Sep 22 11:16:32 2005  Doug Lea  (dl at gee)
+      * Add max_footprint functions
+      * Ensure all appropriate literals are size_t
+      * Fix conditional compilation problem for some #define settings
+      * Avoid concatenating segments with the one provided
+	in create_mspace_with_base
+      * Rename some variables to avoid compiler shadowing warnings
+      * Use explicit lock initialization.
+      * Better handling of sbrk interference.
+      * Simplify and fix segment insertion, trimming and destroy_mspace
+      * Reinstate REALLOC_ZERO_BYTES_FREES option from 2.7.x
+      * Thanks especially to Dennis Flanagan for help on these.
+
+    V2.8.2 Sun Jun 12 16:01:10 2005  Doug Lea  (dl at gee)
+      * Fix memalign brace error.
+
+    V2.8.1 Wed Jun  8 16:11:46 2005  Doug Lea  (dl at gee)
+      * Fix improper #endif nesting in C++
+      * Add explicit casts needed for C++
+
+    V2.8.0 Mon May 30 14:09:02 2005  Doug Lea  (dl at gee)
+      * Use trees for large bins
+      * Support mspaces
+      * Use segments to unify sbrk-based and mmap-based system allocation,
+	removing need for emulation on most platforms without sbrk.
+      * Default safety checks
+      * Optional footer checks. Thanks to William Robertson for the idea.
+      * Internal code refactoring
+      * Incorporate suggestions and platform-specific changes.
+	Thanks to Dennis Flanagan, Colin Plumb, Niall Douglas,
+	Aaron Bachmann,  Emery Berger, and others.
+      * Speed up non-fastbin processing enough to remove fastbins.
+      * Remove useless cfree() to avoid conflicts with other apps.
+      * Remove internal memcpy, memset. Compilers handle builtins better.
+      * Remove some options that no one ever used and rename others.
+
+    V2.7.2 Sat Aug 17 09:07:30 2002  Doug Lea  (dl at gee)
+      * Fix malloc_state bitmap array misdeclaration
+
+    V2.7.1 Thu Jul 25 10:58:03 2002  Doug Lea  (dl at gee)
+      * Allow tuning of FIRST_SORTED_BIN_SIZE
+      * Use PTR_UINT as type for all ptr->int casts. Thanks to John Belmonte.
+      * Better detection and support for non-contiguousness of MORECORE.
+	Thanks to Andreas Mueller, Conal Walsh, and Wolfram Gloger
+      * Bypass most of malloc if no frees. Thanks to Emery Berger.
+      * Fix freeing of old top non-contiguous chunk in sysmalloc.
+      * Raised default trim and map thresholds to 256K.
+      * Fix mmap-related #defines. Thanks to Lubos Lunak.
+      * Fix copy macros; added LACKS_FCNTL_H. Thanks to Neal Walfield.
+      * Branch-free bin calculation
+      * Default trim and mmap thresholds now 256K.
+
+    V2.7.0 Sun Mar 11 14:14:06 2001  Doug Lea  (dl at gee)
+      * Introduce independent_comalloc and independent_calloc.
+	Thanks to Michael Pachos for motivation and help.
+      * Make optional .h file available
+      * Allow > 2GB requests on 32bit systems.
+      * new WIN32 sbrk, mmap, munmap, lock code from <Walter@GeNeSys-e.de>.
+	Thanks also to Andreas Mueller <a.mueller at paradatec.de>,
+	and Anonymous.
+      * Allow override of MALLOC_ALIGNMENT (Thanks to Ruud Waij for
+	helping test this.)
+      * memalign: check alignment arg
+      * realloc: don't try to shift chunks backwards, since this
+	leads to  more fragmentation in some programs and doesn't
+	seem to help in any others.
+      * Collect all cases in malloc requiring system memory into sysmalloc
+      * Use mmap as backup to sbrk
+      * Place all internal state in malloc_state
+      * Introduce fastbins (although similar to 2.5.1)
+      * Many minor tunings and cosmetic improvements
+      * Introduce USE_PUBLIC_MALLOC_WRAPPERS, USE_MALLOC_LOCK
+      * Introduce MALLOC_FAILURE_ACTION, MORECORE_CONTIGUOUS
+	Thanks to Tony E. Bennett <tbennett@nvidia.com> and others.
+      * Include errno.h to support default failure action.
+
+    V2.6.6 Sun Dec  5 07:42:19 1999  Doug Lea  (dl at gee)
+      * return null for negative arguments
+      * Added Several WIN32 cleanups from Martin C. Fong <mcfong at yahoo.com>
+	 * Add 'LACKS_SYS_PARAM_H' for those systems without 'sys/param.h'
+	  (e.g. WIN32 platforms)
+	 * Cleanup header file inclusion for WIN32 platforms
+	 * Cleanup code to avoid Microsoft Visual C++ compiler complaints
+	 * Add 'USE_DL_PREFIX' to quickly allow co-existence with existing
+	   memory allocation routines
+	 * Set 'malloc_getpagesize' for WIN32 platforms (needs more work)
+	 * Use 'assert' rather than 'ASSERT' in WIN32 code to conform to
+	   usage of 'assert' in non-WIN32 code
+	 * Improve WIN32 'sbrk()' emulation's 'findRegion()' routine to
+	   avoid infinite loop
+      * Always call 'fREe()' rather than 'free()'
+
+    V2.6.5 Wed Jun 17 15:57:31 1998  Doug Lea  (dl at gee)
+      * Fixed ordering problem with boundary-stamping
+
+    V2.6.3 Sun May 19 08:17:58 1996  Doug Lea  (dl at gee)
+      * Added pvalloc, as recommended by H.J. Lu
+      * Added 64bit pointer support mainly from Wolfram Gloger
+      * Added anonymously donated WIN32 sbrk emulation
+      * Malloc, calloc, getpagesize: add optimizations from Raymond Nijssen
+      * malloc_extend_top: fix mask error that caused wastage after
+	foreign sbrks
+      * Add linux mremap support code from H.J. Lu
+
+    V2.6.2 Tue Dec  5 06:52:55 1995  Doug Lea  (dl at gee)
+      * Integrated most documentation with the code.
+      * Add support for mmap, with help from
+	Wolfram Gloger (Gloger@lrz.uni-muenchen.de).
+      * Use last_remainder in more cases.
+      * Pack bins using idea from  colin@nyx10.cs.du.edu
+      * Use ordered bins instead of best-fit threshold
+      * Eliminate block-local decls to simplify tracing and debugging.
+      * Support another case of realloc via move into top
+      * Fix error occurring when initial sbrk_base not word-aligned.
+      * Rely on page size for units instead of SBRK_UNIT to
+	avoid surprises about sbrk alignment conventions.
+      * Add mallinfo, mallopt. Thanks to Raymond Nijssen
+	(raymond@es.ele.tue.nl) for the suggestion.
+      * Add `pad' argument to malloc_trim and top_pad mallopt parameter.
+      * More precautions for cases where other routines call sbrk,
+	courtesy of Wolfram Gloger (Gloger@lrz.uni-muenchen.de).
+      * Added macros etc., allowing use in linux libc from
+	H.J. Lu (hjl@gnu.ai.mit.edu)
+      * Inverted this history list
+
+    V2.6.1 Sat Dec  2 14:10:57 1995  Doug Lea  (dl at gee)
+      * Re-tuned and fixed to behave more nicely with V2.6.0 changes.
+      * Removed all preallocation code since under current scheme
+	the work required to undo bad preallocations exceeds
+	the work saved in good cases for most test programs.
+      * No longer use return list or unconsolidated bins since
+	no scheme using them consistently outperforms those that don't
+	given above changes.
+      * Use best fit for very large chunks to prevent some worst-cases.
+      * Added some support for debugging
+
+    V2.6.0 Sat Nov  4 07:05:23 1995  Doug Lea  (dl at gee)
+      * Removed footers when chunks are in use. Thanks to
+	Paul Wilson (wilson@cs.texas.edu) for the suggestion.
+
+    V2.5.4 Wed Nov  1 07:54:51 1995  Doug Lea  (dl at gee)
+      * Added malloc_trim, with help from Wolfram Gloger
+	(wmglo@Dent.MED.Uni-Muenchen.DE).
+
+    V2.5.3 Tue Apr 26 10:16:01 1994  Doug Lea  (dl at g)
+
+    V2.5.2 Tue Apr  5 16:20:40 1994  Doug Lea  (dl at g)
+      * realloc: try to expand in both directions
+      * malloc: swap order of clean-bin strategy;
+      * realloc: only conditionally expand backwards
+      * Try not to scavenge used bins
+      * Use bin counts as a guide to preallocation
+      * Occasionally bin return list chunks in first scan
+      * Add a few optimizations from colin@nyx10.cs.du.edu
+
+    V2.5.1 Sat Aug 14 15:40:43 1993  Doug Lea  (dl at g)
+      * faster bin computation & slightly different binning
+      * merged all consolidations to one part of malloc proper
+	 (eliminating old malloc_find_space & malloc_clean_bin)
+      * Scan 2 returns chunks (not just 1)
+      * Propagate failure in realloc if malloc returns 0
+      * Add stuff to allow compilation on non-ANSI compilers
+	  from kpv@research.att.com
+
+    V2.5 Sat Aug  7 07:41:59 1993  Doug Lea  (dl at g.oswego.edu)
+      * removed potential for odd address access in prev_chunk
+      * removed dependency on getpagesize.h
+      * misc cosmetics and a bit more internal documentation
+      * anticosmetics: mangled names in macros to evade debugger strangeness
+      * tested on sparc, hp-700, dec-mips, rs6000
+	  with gcc & native cc (hp, dec only) allowing
+	  Detlefs & Zorn comparison study (in SIGPLAN Notices.)
+
+    Trial version Fri Aug 28 13:14:29 1992  Doug Lea  (dl at g.oswego.edu)
+      * Based loosely on libg++-1.2X malloc. (It retains some of the overall
+	 structure of old version,  but most details differ.)
+
+*/
diff --git a/lib/nedmalloc/nedmalloc.c b/lib/nedmalloc/nedmalloc.c
new file mode 100644
index 0000000..ff5c82c
--- /dev/null
+++ b/lib/nedmalloc/nedmalloc.c
@@ -0,0 +1,954 @@
+/* Alternative malloc implementation for multiple threads without
+lock contention based on dlmalloc. (C) 2005-2006 Niall Douglas
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+#ifdef _MSC_VER
+/* Enable full aliasing on MSVC */
+/*#pragma optimize("a", on)*/
+#endif
+
+/*#define FULLSANITYCHECKS*/
+#include <errno.h>
+#include "nedmalloc.h"
+#if defined(WIN32)
+ #include <malloc.h>
+#endif
+#define MSPACES 1			/* dlmalloc build options: these must be set before malloc.c.h is included below */
+#define ONLY_MSPACES 1
+#ifndef USE_LOCKS
+ #define USE_LOCKS 1			/* default only; a value supplied by the build is kept */
+#endif
+#define FOOTERS 1           /* Need to enable footers so frees lock the right mspace */
+#undef DEBUG				/* dlmalloc wants DEBUG either 0 or 1 */
+#ifdef _DEBUG
+ #define DEBUG 1
+#else
+ #define DEBUG 0
+#endif
+#ifdef NDEBUG               /* Disable assert checking on release builds (leaves DEBUG undefined rather than 0) */
+ #undef DEBUG
+#endif
+/* The default of 64KB means we spend too much time kernel-side, so default to 1MB instead */
+#ifndef DEFAULT_GRANULARITY
+#define DEFAULT_GRANULARITY (1*1024*1024)
+#endif
+/*#define USE_SPIN_LOCKS 0*/
+
+
+/*#define FORCEINLINE*/
+#include "malloc.c.h"
+#ifdef NDEBUG               /* Disable assert checking on release builds (repeated after the include) */
+ #undef DEBUG
+#endif
+
+/* The maximum number of concurrent threads possible in a pool */
+#ifndef MAXTHREADSINPOOL
+#define MAXTHREADSINPOOL 16
+#endif
+/* The maximum number of threadcaches which can be allocated */
+#ifndef THREADCACHEMAXCACHES
+#define THREADCACHEMAXCACHES 256
+#endif
+/* The maximum size (in bytes) to be allocated from the thread cache */
+#ifndef THREADCACHEMAX
+#define THREADCACHEMAX 8192
+#endif
+#if 0
+/* The number of cache entries for finer grained bins. This is (topbitpos(THREADCACHEMAX)-4)*2 */
+#define THREADCACHEMAXBINS ((13-4)*2)
+#else
+/* The number of cache entries. This is (topbitpos(THREADCACHEMAX)-4); NOTE(review): the 13 is hard-coded for the default 8192 and does not follow an overridden THREADCACHEMAX */
+#define THREADCACHEMAXBINS (13-4)
+#endif
+/* Point (in bytes of cached free space) at which the free space in a thread cache is garbage collected */
+#ifndef THREADCACHEMAXFREESPACE
+#define THREADCACHEMAXFREESPACE (512*1024)
+#endif
+
+
+#ifdef WIN32	/* Thread-local-storage wrappers: Win32 TLS API here, pthread keys in the #else branch */
+ #define TLSVAR			DWORD
+ #define TLSALLOC(k)	(*(k)=TlsAlloc(), TLS_OUT_OF_INDEXES==*(k))	/* stores the index in *k; non-zero when no index was available */
+ #define TLSFREE(k)		(!TlsFree(k))	/* result is negated here; presumably so zero means success as with the pthread calls - confirm */
+ #define TLSGET(k)		TlsGetValue(k)
+ #define TLSSET(k, a)	(!TlsSetValue(k, a))	/* negated in the same way as TLSFREE */
+ #ifdef DEBUG	/* NOTE(review): tests whether DEBUG is defined, not whether it is non-zero */
+static LPVOID ChkedTlsGetValue(DWORD idx)	/* TlsGetValue plus an assert that GetLastError() reports no error */
+{
+	LPVOID ret=TlsGetValue(idx);
+	assert(S_OK==GetLastError());	/* NOTE(review): S_OK is an HRESULT constant; ERROR_SUCCESS would be the conventional name to compare against */
+	return ret;
+}
+  #undef TLSGET
+  #define TLSGET(k) ChkedTlsGetValue(k)
+ #endif
+#else
+ #define TLSVAR			pthread_key_t
+ #define TLSALLOC(k)	pthread_key_create(k, 0)	/* second argument 0: no destructor is registered for the key */
+ #define TLSFREE(k)		pthread_key_delete(k)
+ #define TLSGET(k)		pthread_getspecific(k)
+ #define TLSSET(k, a)	pthread_setspecific(k, a)
+#endif
+
+#if 0
+/* Only enable if testing with valgrind: maps the mspace_* calls onto malloc/realloc/calloc/free and drops the mspace argument. Causes misoperation */
+#define mspace_malloc(p, s) malloc(s)
+#define mspace_realloc(p, m, s) realloc(m, s)
+#define mspace_calloc(p, n, s) calloc(n, s)
+#define mspace_free(p, m) free(m)
+#endif
+
+
+#if defined(__cplusplus)
+#if !defined(NO_NED_NAMESPACE)
+namespace nedalloc {
+#else
+extern "C" {
+#endif
+#endif
+
+size_t nedblksize(void *mem) THROWSPEC
+{	/* Chunk size minus overhead for the block at mem; 0 when mem is null or its chunk is not marked in use */
+#if 0
+	/* Only enable if testing with valgrind. Causes misoperation */
+	return THREADCACHEMAX;
+#else
+	mchunkptr chunk;
+	if(!mem)
+		return 0;
+	chunk=mem2chunk(mem);
+	assert(cinuse(chunk));	/* If this fails, someone tried to free a block twice */
+	if(!cinuse(chunk))
+		return 0;
+	return chunksize(chunk)-overhead_for(chunk);
+#endif
+}
+
+void nedsetvalue(void *v) THROWSPEC					{ nedpsetvalue(0, v); }	/* This and the wrappers below forward to the nedp* function with 0 as the pool argument (presumably the default pool - confirm) */
+void * nedmalloc(size_t size) THROWSPEC				{ return nedpmalloc(0, size); }
+void * nedcalloc(size_t no, size_t size) THROWSPEC	{ return nedpcalloc(0, no, size); }
+void * nedrealloc(void *mem, size_t size) THROWSPEC	{ return nedprealloc(0, mem, size); }
+void   nedfree(void *mem) THROWSPEC					{ nedpfree(0, mem); }
+void * nedmemalign(size_t alignment, size_t bytes) THROWSPEC { return nedpmemalign(0, alignment, bytes); }
+int    nedposix_memalign(void** p, size_t alignment, size_t bytes) THROWSPEC { void *ret=nedmemalign(alignment, bytes); if(!ret) return ENOMEM; *p=ret; return 0; }	/* a failed allocation is reported as ENOMEM and leaves *p untouched, instead of returning 0 with *p null */
+#if !NO_MALLINFO
+struct mallinfo nedmallinfo(void) THROWSPEC			{ return nedpmallinfo(0); }
+#endif
+int    nedmallopt(int parno, int value) THROWSPEC	{ return nedpmallopt(0, parno, value); }
+int    nedmalloc_trim(size_t pad) THROWSPEC			{ return nedpmalloc_trim(0, pad); }
+void   nedmalloc_stats(void) THROWSPEC					{ nedpmalloc_stats(0); }
+size_t nedmalloc_footprint(void) THROWSPEC				{ return nedpmalloc_footprint(0); }
+void **nedindependent_calloc(size_t elemsno, size_t elemsize, void **chunks) THROWSPEC	{ return nedpindependent_calloc(0, elemsno, elemsize, chunks); }
+void **nedindependent_comalloc(size_t elems, size_t *sizes, void **chunks) THROWSPEC	{ return nedpindependent_comalloc(0, elems, sizes, chunks); }
+
+struct threadcacheblk_t;
+typedef struct threadcacheblk_t threadcacheblk;
+struct threadcacheblk_t
+{	/* Keep less than 16 bytes on 32 bit systems and 32 bytes on 64 bit systems */
+#ifdef FULLSANITYCHECKS
+	unsigned int magic;
+#endif
+	unsigned int lastUsed, size;
+	threadcacheblk *next, *prev;
+};
+typedef struct threadcache_t
+{
+#ifdef FULLSANITYCHECKS
+	unsigned int magic1;
+#endif
+	int mymspace;						/* Last mspace entry this thread used */
+	long threadid;
+	unsigned int mallocs, frees, successes;
+	size_t freeInCache;					/* How much free space is stored in this cache */
+	threadcacheblk *bins[(THREADCACHEMAXBINS+1)*2];
+#ifdef FULLSANITYCHECKS
+	unsigned int magic2;
+#endif
+} threadcache;
+struct nedpool_t
+{
+	MLOCK_T mutex;
+	void *uservalue;
+	int threads;						/* Max entries in m to use */
+	threadcache *caches[THREADCACHEMAXCACHES];
+	TLSVAR mycache;						/* Thread cache for this thread. 0 for unset, negative for use mspace-1 directly, otherwise is cache-1 */
+	mstate m[MAXTHREADSINPOOL+1];		/* mspace entries for this pool */
+};
+static nedpool syspool;
+
+static FORCEINLINE unsigned int size2binidx(size_t _size) THROWSPEC
+{	/* 8=1000	16=10000	20=10100	24=11000	32=100000	48=110000	4096=1000000000000 */
+	unsigned int topbit, size=(unsigned int)(_size>>4);
+	/* 16=1		20=1	24=1	32=10	48=11	64=100	96=110	128=1000	4096=100000000 */
+
+#if defined(__GNUC__)
+	topbit = sizeof(size)*__CHAR_BIT__ - 1 - __builtin_clz(size);
+#elif defined(_MSC_VER) && _MSC_VER>=1300
+	{
+	    unsigned long bsrTopBit;
+
+	    _BitScanReverse(&bsrTopBit, size);
+
+	    topbit = bsrTopBit;
+	}
+#else
+#if 0
+	union {
+		unsigned asInt[2];
+		double asDouble;
+	};
+	int n;
+
+	asDouble = (double)size + 0.5;
+	topbit = (asInt[!FOX_BIGENDIAN] >> 20) - 1023;
+#else
+	{
+		unsigned int x=size;
+		x = x | (x >> 1);
+		x = x | (x >> 2);
+		x = x | (x >> 4);
+		x = x | (x >> 8);
+		x = x | (x >>16);
+		x = ~x;
+		x = x - ((x >> 1) & 0x55555555);
+		x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+		x = (x + (x >> 4)) & 0x0F0F0F0F;
+		x = x + (x << 8);
+		x = x + (x << 16);
+		topbit=31 - (x >> 24);
+	}
+#endif
+#endif
+	return topbit;
+}
+
+
+#ifdef FULLSANITYCHECKS
+static void tcsanitycheck(threadcacheblk **ptr) THROWSPEC
+{
+	assert((ptr[0] && ptr[1]) || (!ptr[0] && !ptr[1]));
+	if(ptr[0] && ptr[1])
+	{
+		assert(nedblksize(ptr[0])>=sizeof(threadcacheblk));
+		assert(nedblksize(ptr[1])>=sizeof(threadcacheblk));
+		assert(*(unsigned int *) "NEDN"==ptr[0]->magic);
+		assert(*(unsigned int *) "NEDN"==ptr[1]->magic);
+		assert(!ptr[0]->prev);
+		assert(!ptr[1]->next);
+		if(ptr[0]==ptr[1])
+		{
+			assert(!ptr[0]->next);
+			assert(!ptr[1]->prev);
+		}
+	}
+}
+static void tcfullsanitycheck(threadcache *tc) THROWSPEC
+{
+	threadcacheblk **tcbptr=tc->bins;
+	int n;
+	for(n=0; n<=THREADCACHEMAXBINS; n++, tcbptr+=2)
+	{
+		threadcacheblk *b, *ob=0;
+		tcsanitycheck(tcbptr);
+		for(b=tcbptr[0]; b; ob=b, b=b->next)
+		{
+			assert(*(unsigned int *) "NEDN"==b->magic);
+			assert(!ob || ob->next==b);
+			assert(!ob || b->prev==ob);
+		}
+	}
+}
+#endif
+
+static NOINLINE void RemoveCacheEntries(nedpool *p, threadcache *tc, unsigned int age) THROWSPEC
+{	/* Hands to mspace_free() every cached block for which tc->frees-lastUsed>=age; p is not used here */
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	if(tc->freeInCache)
+	{
+		threadcacheblk **tcbptr=tc->bins;	/* each bin occupies two slots: [0] list head, [1] list tail */
+		int n;
+		for(n=0; n<=THREADCACHEMAXBINS; n++, tcbptr+=2)
+		{
+			threadcacheblk **tcb=tcbptr+1;		/* come from oldest end of list */
+			/*tcsanitycheck(tcbptr);*/
+			for(; *tcb && tc->frees-(*tcb)->lastUsed>=age; )
+			{
+				threadcacheblk *f=*tcb;
+				size_t blksize=f->size; /*nedblksize(f);*/
+				assert(blksize<=nedblksize(f));
+				assert(blksize);
+#ifdef FULLSANITYCHECKS
+				assert(*(unsigned int *) "NEDN"==(*tcb)->magic);
+#endif
+				*tcb=(*tcb)->prev;		/* unlink f: its predecessor becomes the new tail */
+				if(*tcb)
+					(*tcb)->next=0;
+				else
+					*tcbptr=0;			/* f was the only entry, so the head is cleared too */
+				tc->freeInCache-=blksize;
+				assert((long) tc->freeInCache>=0);
+				mspace_free(0, f);
+				/*tcsanitycheck(tcbptr);*/
+			}
+		}
+	}
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+}
+static void DestroyCaches(nedpool *p) THROWSPEC
+{	/* Empties and frees every thread cache registered in p->caches and
+	zeroes its slot. caches is an array member of nedpool, so no null test
+	on it is needed before walking it. */
+	threadcache *tc;
+	int n;
+	for(n=0; n<THREADCACHEMAXCACHES; n++)
+	{
+		if(!(tc=p->caches[n]))
+			continue;				/* unused slot */
+		tc->frees++;
+		/* An age of 0 matches every cached block, so the cache is emptied */
+		RemoveCacheEntries(p, tc, 0);
+		assert(!tc->freeInCache);
+		/* Invalidate the header fields before handing the memory back */
+		tc->mymspace=-1;
+		tc->threadid=0;
+		mspace_free(0, tc);
+		p->caches[n]=0;
+	}
+}
+
+static NOINLINE threadcache *AllocCache(nedpool *p) THROWSPEC
+{	/* Claims a free slot in p->caches for the calling thread; returns 0 if no slot is free or the allocation fails */
+	threadcache *tc=0;
+	int n, end;
+	ACQUIRE_LOCK(&p->mutex);
+	for(n=0; n<THREADCACHEMAXCACHES && p->caches[n]; n++);
+	if(THREADCACHEMAXCACHES==n)
+	{	/* List exhausted, so disable for this thread */
+		RELEASE_LOCK(&p->mutex);
+		return 0;
+	}
+	tc=p->caches[n]=(threadcache *) mspace_calloc(p->m[0], 1, sizeof(threadcache));
+	if(!tc)
+	{	/* Allocation failed: the slot was just assigned 0, so it stays free */
+		RELEASE_LOCK(&p->mutex);
+		return 0;
+	}
+#ifdef FULLSANITYCHECKS
+	tc->magic1=*(unsigned int *)"NEDMALC1";
+	tc->magic2=*(unsigned int *)"NEDMALC2";
+#endif
+	tc->threadid=(long)(size_t)CURRENT_THREAD;
+	for(end=0; p->m[end]; end++);	/* end = number of mspaces currently in the pool */
+	tc->mymspace=(int)((unsigned long) tc->threadid % (unsigned long) end);	/* reduce as unsigned: a negative threadid must not yield a negative mspace index */
+	RELEASE_LOCK(&p->mutex);
+	if(TLSSET(p->mycache, (void *)(size_t)(n+1))) abort();	/* the TLS slot holds cache index + 1 */
+	return tc;
+}
+
+static void *threadcache_malloc(nedpool *p, threadcache *tc, size_t *size) THROWSPEC
+{	/* Tries to pop a cached block of at least *size bytes from tc; returns 0 on a miss. *size is rounded up to the bin size either way */
+	void *ret=0;
+	unsigned int bestsize;
+	unsigned int idx=size2binidx(*size);
+	size_t blksize=0;
+	threadcacheblk *blk, **binsptr;
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	/* Calculate best fit bin size */
+	bestsize=1<<(idx+4);
+#if 0
+	/* Finer grained bin fit */
+	idx<<=1;
+	if(*size>bestsize)
+	{
+		idx++;
+		bestsize+=bestsize>>1;
+	}
+	if(*size>bestsize)
+	{
+		idx++;
+		bestsize=1<<(4+(idx>>1));
+	}
+#else
+	if(*size>bestsize)
+	{	/* request is not an exact power of two: use the next bin up */
+		idx++;
+		bestsize<<=1;
+	}
+#endif
+	assert(bestsize>=*size);
+	if(*size<bestsize) *size=bestsize;
+	assert(*size<=THREADCACHEMAX);
+	assert(idx<=THREADCACHEMAXBINS);
+	binsptr=&tc->bins[idx*2];	/* bins[idx*2] is the list head, bins[idx*2+1] the tail */
+	/* Try to match close, but move up a bin if necessary */
+	blk=*binsptr;
+	if(!blk || blk->size<*size)
+	{	/* Bump it up a bin */
+		if(idx<THREADCACHEMAXBINS)
+		{
+			idx++;
+			binsptr+=2;
+			blk=*binsptr;
+		}
+	}
+	if(blk)
+	{	/* Unlink blk from the head of its bin */
+		blksize=blk->size; /*nedblksize(blk);*/
+		assert(nedblksize(blk)>=blksize);
+		assert(blksize>=*size);
+		if(blk->next)
+			blk->next->prev=0;
+		*binsptr=blk->next;
+		if(!*binsptr)
+			binsptr[1]=0;		/* list is now empty, so clear the tail as well */
+#ifdef FULLSANITYCHECKS
+		blk->magic=0;
+#endif
+		assert(binsptr[0]!=blk && binsptr[1]!=blk);
+		assert(nedblksize(blk)>=sizeof(threadcacheblk) && nedblksize(blk)<=THREADCACHEMAX+CHUNK_OVERHEAD);
+		/*printf("malloc: %p, %p, %p, %lu\n", p, tc, blk, (long) size);*/
+		ret=(void *) blk;
+	}
+	++tc->mallocs;	/* counts every attempt; successes below counts only the hits */
+	if(ret)
+	{
+		assert(blksize>=*size);
+		++tc->successes;
+		tc->freeInCache-=blksize;
+		assert((long) tc->freeInCache>=0);
+	}
+#if defined(DEBUG) && 0
+	if(!(tc->mallocs & 0xfff))
+	{
+		printf("*** threadcache=%u, mallocs=%u (%f), free=%u (%f), freeInCache=%u\n", (unsigned int) tc->threadid, tc->mallocs,
+			(float) tc->successes/tc->mallocs, tc->frees, (float) tc->successes/tc->frees, (unsigned int) tc->freeInCache);
+	}
+#endif
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	return ret;
+}
+static NOINLINE void ReleaseFreeInCache(nedpool *p, threadcache *tc, int mymspace) THROWSPEC
+{	/* Evicts cached blocks, halving the age cutoff per pass, until freeInCache drops below THREADCACHEMAXFREESPACE */
+	unsigned int cutoff;
+	/*ACQUIRE_LOCK(&p->m[mymspace]->mutex);*/
+	for(cutoff=THREADCACHEMAXFREESPACE/8192; cutoff; cutoff>>=1)
+	{
+		if(tc->freeInCache<THREADCACHEMAXFREESPACE) break;
+		RemoveCacheEntries(p, tc, cutoff);
+		/*printf("*** Removing cache entries older than %u (%u)\n", cutoff, (unsigned int) tc->freeInCache);*/
+	}
+	/*RELEASE_LOCK(&p->m[mymspace]->mutex);*/
+}
+static void threadcache_free(nedpool *p, threadcache *tc, int mymspace, void *mem, size_t size) THROWSPEC
+{	/* Pushes mem onto the head of the bin matching size in tc, then trims the cache once it reaches THREADCACHEMAXFREESPACE */
+	unsigned int bestsize;
+	unsigned int idx=size2binidx(size);
+	threadcacheblk **binsptr, *tck=(threadcacheblk *) mem;	/* the freed block itself stores the list node */
+	assert(size>=sizeof(threadcacheblk) && size<=THREADCACHEMAX+CHUNK_OVERHEAD);
+#ifdef DEBUG
+	{	/* Make sure this is a valid memory block */
+	    mchunkptr p  = mem2chunk(mem);	/* note: shadows the nedpool parameter p inside this scope */
+	    mstate fm = get_mstate_for(p);
+	    if (!ok_magic(fm)) {
+	      USAGE_ERROR_ACTION(fm, p);
+	      return;
+	    }
+	}
+#endif
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+	/* Calculate best fit bin size */
+	bestsize=1<<(idx+4);
+#if 0
+	/* Finer grained bin fit */
+	idx<<=1;
+	if(size>bestsize)
+	{
+		unsigned int biggerbestsize=bestsize+bestsize<<1;
+		if(size>=biggerbestsize)
+		{
+			idx++;
+			bestsize=biggerbestsize;
+		}
+	}
+#endif
+	if(bestsize!=size)	/* dlmalloc can round up, so we round down to preserve indexing */
+		size=bestsize;
+	binsptr=&tc->bins[idx*2];	/* bins[idx*2] is the list head, bins[idx*2+1] the tail */
+	assert(idx<=THREADCACHEMAXBINS);
+	if(tck==*binsptr)
+	{	/* only catches a repeat free of the block currently at the head of this bin */
+		fprintf(stderr, "Attempt to free already freed memory block %p - aborting!\n", tck);
+		abort();
+	}
+#ifdef FULLSANITYCHECKS
+	tck->magic=*(unsigned int *) "NEDN";
+#endif
+	tck->lastUsed=++tc->frees;	/* the frees counter doubles as the clock used for ageing entries */
+	tck->size=(unsigned int) size;
+	tck->next=*binsptr;
+	tck->prev=0;
+	if(tck->next)
+		tck->next->prev=tck;
+	else
+		binsptr[1]=tck;		/* bin was empty, so tck is the tail too */
+	assert(!*binsptr || (*binsptr)->size==tck->size);
+	*binsptr=tck;
+	assert(tck==tc->bins[idx*2]);
+	assert(tc->bins[idx*2+1]==tck || binsptr[0]->next->prev==tck);
+	/*printf("free: %p, %p, %p, %lu\n", p, tc, mem, (long) size);*/
+	tc->freeInCache+=size;
+#ifdef FULLSANITYCHECKS
+	tcfullsanitycheck(tc);
+#endif
+#if 1
+	if(tc->freeInCache>=THREADCACHEMAXFREESPACE)
+		ReleaseFreeInCache(p, tc, mymspace);
+#endif
+}
+
+
+
+
+static NOINLINE int InitPool(nedpool *p, size_t capacity, int threads) THROWSPEC
+{	/* Sets up p's mutex, TLS slot and first mspace; returns 1 on success, 0 on failure. threads is -1 for system pool */
+	ensure_initialization();
+	ACQUIRE_MALLOC_GLOBAL_LOCK();
+	if(p->threads) goto done;	/* a nonzero thread count means the pool was already initialised */
+	if(INITIAL_LOCK(&p->mutex)) goto err;
+	if(TLSALLOC(&p->mycache)) goto err;
+	if(!(p->m[0]=(mstate) create_mspace(capacity, 1))) goto err;
+	p->m[0]->extp=p;	/* back-pointer that nedgetvalue() reads to find the owning pool */
+	p->threads=(threads<1 || threads>MAXTHREADSINPOOL) ? MAXTHREADSINPOOL : threads;	/* out-of-range values, including -1, select the maximum */
+done:
+	RELEASE_MALLOC_GLOBAL_LOCK();
+	return 1;
+err:
+	if(threads<0)
+		abort();			/* If you can't allocate for system pool, we're screwed */
+	DestroyCaches(p);
+	if(p->m[0])
+	{
+		destroy_mspace(p->m[0]);
+		p->m[0]=0;
+	}
+	if(p->mycache)	/* NOTE(review): treats a nonzero key as "allocated"; whether a valid key can be 0 is platform specific - confirm */
+	{
+		if(TLSFREE(p->mycache)) abort();
+		p->mycache=0;
+	}
+	RELEASE_MALLOC_GLOBAL_LOCK();
+	return 0;
+}
+static NOINLINE mstate FindMSpace(nedpool *p, threadcache *tc, int *lastUsed, size_t size) THROWSPEC
+{	/* Called when the thread's last used mspace is in use. The strategy
+	is to run through the list of all available mspaces looking for an
+	unlocked one and, if that fails, to create a new one so long as we
+	don't exceed p->threads. The returned mspace's mutex has been taken. */
+	int n, end;
+	for(n=end=*lastUsed+1; p->m[n]; end=++n)	/* first the entries after the last used one; end stops at the first empty slot */
+	{
+		if(TRY_LOCK(&p->m[n]->mutex)) goto found;
+	}
+	for(n=0; n<*lastUsed && p->m[n]; n++)	/* then wrap around to the entries before it */
+	{
+		if(TRY_LOCK(&p->m[n]->mutex)) goto found;
+	}
+	if(end<p->threads)
+	{
+		mstate temp;
+		if(!(temp=(mstate) create_mspace(size, 1)))
+			goto badexit;
+		/* Now we're ready to modify the lists, we lock */
+		ACQUIRE_LOCK(&p->mutex);
+		while(p->m[end] && end<p->threads)	/* skip slots filled since the scan above */
+			end++;
+		if(end>=p->threads)
+		{	/* Drat, must destroy it now */
+			RELEASE_LOCK(&p->mutex);
+			destroy_mspace((mspace) temp);
+			goto badexit;
+		}
+		/* We really want to make sure this goes into memory now but we
+		have to be careful of breaking aliasing rules, so write it twice */
+		{
+			volatile struct malloc_state **_m=(volatile struct malloc_state **) &p->m[end];
+			*_m=(p->m[end]=temp);
+		}
+		ACQUIRE_LOCK(&p->m[end]->mutex);
+		/*printf("Created mspace idx %d\n", end);*/
+		RELEASE_LOCK(&p->mutex);
+		n=end;
+		goto found;
+	}
+	/* Let it lock on the last one it used */
+badexit:
+	ACQUIRE_LOCK(&p->m[*lastUsed]->mutex);	/* blocking acquire, unlike the TRY_LOCK attempts above */
+	return p->m[*lastUsed];
+found:
+	*lastUsed=n;
+	if(tc)
+		tc->mymspace=n;
+	else
+	{	/* no thread cache: remember the choice in TLS as -(index+1) */
+		if(TLSSET(p->mycache, (void *)(size_t)(-(n+1)))) abort();
+	}
+	return p->m[n];
+}
+
+nedpool *nedcreatepool(size_t capacity, int threads) THROWSPEC
+{	/* Allocates a zeroed pool header from the system pool and initialises it; returns 0 on failure */
+	nedpool *pool=(nedpool *) nedpcalloc(0, 1, sizeof(nedpool));
+	if(!pool)
+		return 0;
+	if(InitPool(pool, capacity, threads))
+		return pool;
+	/* Initialisation failed, so give the header back */
+	nedpfree(0, pool);
+	return 0;
+}
+void neddestroypool(nedpool *p) THROWSPEC
+{	/* Tears down the thread caches, every mspace and the TLS slot of p, then frees p itself */
+	mstate *slot;
+	ACQUIRE_LOCK(&p->mutex);
+	DestroyCaches(p);
+	for(slot=p->m; *slot; slot++)
+	{	/* the mspace list ends at the first empty slot */
+		destroy_mspace(*slot);
+		*slot=0;
+	}
+	RELEASE_LOCK(&p->mutex);
+	if(TLSFREE(p->mycache)) abort();
+	nedpfree(0, p);
+}
+
+void nedpsetvalue(nedpool *p, void *v) THROWSPEC
+{	/* Stores v as the pool's user value; a null p selects the system pool, initialising it on first use */
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	p->uservalue=v;
+}
+void *nedgetvalue(nedpool **p, void *mem) THROWSPEC
+{	/* Returns the user value of the pool owning mem (also storing that pool in *p when p is non-null), or 0 if mem fails validation */
+	nedpool *np=0;
+	mchunkptr mcp;
+	mstate fm;
+	if(!mem) return 0;	/* a null pointer belongs to no pool; never read a chunk header for it */
+	mcp=mem2chunk(mem);
+	if(!(is_aligned(chunk2mem(mcp))) && mcp->head != FENCEPOST_HEAD) return 0;
+	if(!cinuse(mcp) || !next_pinuse(mcp)) return 0;
+	if(!is_mmapped(mcp) && !pinuse(mcp))
+	{	/* stepping to the previous chunk and forward again must land back on mcp */
+		if(next_chunk(prev_chunk(mcp))!=mcp) return 0;
+	}
+	fm=get_mstate_for(mcp);
+	if(!ok_magic(fm) || !ok_address(fm, mcp)) return 0;
+	if(!fm->extp) return 0;	/* mspace has no owning pool recorded */
+	np=(nedpool *) fm->extp;
+	if(p) *p=np;
+	return np->uservalue;
+}
+
+void neddisablethreadcache(nedpool *p) THROWSPEC
+{	/* Switches the calling thread to direct mspace use and releases its thread cache, if it has one */
+	int mycache;
+	if(!p)
+	{
+		p=&syspool;
+		if(!syspool.threads) InitPool(&syspool, 0, -1);
+	}
+	mycache=(int)(size_t) TLSGET(p->mycache);
+	if(!mycache)
+	{	/* Set to mspace 0 */
+		if(TLSSET(p->mycache, (void *)-1)) abort();
+	}
+	else if(mycache>0)
+	{	/* Set to last used mspace, stored as -(index+1) so that mspace 0 does not become 0, which means "unset" */
+		threadcache *tc=p->caches[mycache-1];
+#if defined(DEBUG)
+		printf("Threadcache utilisation: %lf%% in cache with %lf%% lost to other threads\n",
+			100.0*tc->successes/tc->mallocs, 100.0*((double) tc->mallocs-tc->frees)/tc->mallocs);
+#endif
+		if(TLSSET(p->mycache, (void *)(size_t)(-(tc->mymspace+1)))) abort();
+		tc->frees++;
+		RemoveCacheEntries(p, tc, 0);	/* age 0 matches every cached block */
+		assert(!tc->freeInCache);
+		tc->mymspace=-1;
+		tc->threadid=0;
+		mspace_free(0, p->caches[mycache-1]);
+		p->caches[mycache-1]=0;	/* slot becomes available to AllocCache again */
+	}
+}
+
+#define GETMSPACE(m,p,tc,ms,s,action)           \
+  do                                            \
+  {                                             \
+    mstate m = GetMSpace((p),(tc),(ms),(s));    \
+    action;                                     \
+    RELEASE_LOCK(&m->mutex);                    \
+  } while (0)
+
+static FORCEINLINE mstate GetMSpace(nedpool *p, threadcache *tc, int mymspace, size_t size) THROWSPEC
+{	/* Returns a locked and ready for use mspace */
+	mstate m=p->m[mymspace];
+	assert(m);
+	if(!TRY_LOCK(&p->m[mymspace]->mutex)) m=FindMSpace(p, tc, &mymspace, size);	/* preferred mspace is busy: find or create another */
+	/*assert(IS_LOCKED(&p->m[mymspace]->mutex));*/
+	return m;
+}
+static FORCEINLINE void GetThreadCache(nedpool **p, threadcache **tc, int *mymspace, size_t *size) THROWSPEC
+{	/* Resolves *p (null selects the system pool), the caller's thread cache *tc (0 if disabled) and its mspace index */
+	int mycache;
+	if(size && *size<sizeof(threadcacheblk)) *size=sizeof(threadcacheblk);	/* threadcache_free() stores a threadcacheblk inside the freed block */
+	if(!*p)
+	{
+		*p=&syspool;
+		if(!syspool.threads) InitPool(&syspool, 0, -1);
+	}
+	mycache=(int)(size_t) TLSGET((*p)->mycache);
+	if(mycache>0)
+	{	/* Positive: cache slot mycache-1 belongs to this thread */
+		*tc=(*p)->caches[mycache-1];
+		*mymspace=(*tc)->mymspace;
+	}
+	else if(!mycache)
+	{	/* Zero: nothing set for this thread yet, so try to give it a cache */
+		*tc=AllocCache(*p);
+		if(!*tc)
+		{	/* Disable */
+			if(TLSSET((*p)->mycache, (void *)-1)) abort();
+			*mymspace=0;
+		}
+		else
+			*mymspace=(*tc)->mymspace;
+	}
+	else
+	{	/* Negative: cache disabled, value is -(mspace index+1) */
+		*tc=0;
+		*mymspace=-mycache-1;
+	}
+	assert(*mymspace>=0);
+	assert(!*tc || (long)(size_t)CURRENT_THREAD==(*tc)->threadid);	/* *tc is 0 when the cache is disabled, so guard the dereference */
+#ifdef FULLSANITYCHECKS
+	if(*tc)
+	{
+		if(*(unsigned int *)"NEDMALC1"!=(*tc)->magic1 || *(unsigned int *)"NEDMALC2"!=(*tc)->magic2)
+		{
+			abort();
+		}
+	}
+#endif
+}
+
+void * nedpmalloc(nedpool *p, size_t size) THROWSPEC
+{	/* Allocates size bytes from pool p; a null p selects the system pool */
+	threadcache *tc;
+	int mymspace;
+	void *blk=0;
+	GetThreadCache(&p, &tc, &mymspace, &size);
+#if THREADCACHEMAX
+	/* Requests up to THREADCACHEMAX are tried against the thread cache first */
+	if(tc && size<=THREADCACHEMAX)
+		blk=threadcache_malloc(p, tc, &size);
+	if(blk)
+		return blk;
+#endif
+	/* Cache miss, cache disabled or request too large:
+	allocate from this thread's mspace */
+	GETMSPACE(m, p, tc, mymspace, size,
+		  blk=mspace_malloc(m, size));
+	return blk;
+}
+void * nedpcalloc(nedpool *p, size_t no, size_t size) THROWSPEC
+{	/* Allocates a zero-filled array of no elements of size bytes each; returns 0 on failure or if no*size overflows size_t */
+	size_t rsize=size*no;
+	void *ret=0;
+	threadcache *tc;
+	int mymspace;
+	/* If the product wrapped around, refuse rather than hand back a block that is too short */
+	if(size && rsize/size!=no) return 0;
+	GetThreadCache(&p, &tc, &mymspace, &rsize);
+#if THREADCACHEMAX
+	/* Use the thread cache; a cached block holds stale data and must be cleared here */
+	if(tc && rsize<=THREADCACHEMAX && (ret=threadcache_malloc(p, tc, &rsize)))
+		memset(ret, 0, rsize);
+#endif
+	if(!ret)
+	{	/* Use this thread's mspace */
+	GETMSPACE(m, p, tc, mymspace, rsize,
+		  ret=mspace_calloc(m, 1, rsize));
+	}
+	return ret;
+}
+void * nedprealloc(nedpool *p, void *mem, size_t size) THROWSPEC
+{	/* Resizes mem to size bytes; a null mem is forwarded to nedpmalloc() */
+	void *ret=0;
+	threadcache *tc;
+	int mymspace;
+	if(!mem) return nedpmalloc(p, size);
+	GetThreadCache(&p, &tc, &mymspace, &size);	/* also raises size to at least sizeof(threadcacheblk) */
+#if THREADCACHEMAX
+	if(tc && size && size<=THREADCACHEMAX)
+	{	/* Use the thread cache */
+		size_t memsize=nedblksize(mem);
+		assert(memsize);
+		if((ret=threadcache_malloc(p, tc, &size)))
+		{	/* got a cached block: copy the smaller of the old and new sizes, then release the old block */
+			memcpy(ret, mem, memsize<size ? memsize : size);
+			if(memsize<=THREADCACHEMAX)
+				threadcache_free(p, tc, mymspace, mem, memsize);
+			else
+				mspace_free(0, mem);
+		}
+	}
+#endif
+	if(!ret)
+	{	/* Reallocs always happen in the mspace they happened in, so skip
+		locking the preferred mspace for this thread */
+		ret=mspace_realloc(0, mem, size);
+	}
+	return ret;
+}
+void   nedpfree(nedpool *p, void *mem) THROWSPEC
+{	/* Frees always happen in the mspace they happened in, so skip
+	locking the preferred mspace for this thread */
+	threadcache *tc;
+	int mymspace;
+	size_t memsize;
+	assert(mem);
+	GetThreadCache(&p, &tc, &mymspace, 0);	/* no size to adjust, hence the 0 */
+#if THREADCACHEMAX
+	memsize=nedblksize(mem);	/* nedblksize() yields 0 for a null pointer */
+	assert(memsize);
+	if(mem && tc && memsize<=(THREADCACHEMAX+CHUNK_OVERHEAD))
+		threadcache_free(p, tc, mymspace, mem, memsize);	/* keep the block in this thread's cache */
+	else
+#endif
+		mspace_free(0, mem);	/* null, too large, or no thread cache: hand it to mspace_free() */
+}
+void * nedpmemalign(nedpool *p, size_t alignment, size_t bytes) THROWSPEC
+{	/* Aligned allocation via mspace_memalign(); the thread cache is not consulted. A null p selects the system pool */
+	void *ret;
+	threadcache *tc;
+	int mymspace;
+	GetThreadCache(&p, &tc, &mymspace, &bytes);	/* may raise bytes to sizeof(threadcacheblk) */
+	{	/* Use this thread's mspace */
+	GETMSPACE(m, p, tc, mymspace, bytes,
+		  ret=mspace_memalign(m, alignment, bytes));
+	}
+	return ret;
+}
+#if !NO_MALLINFO
+struct mallinfo nedpmallinfo(nedpool *p) THROWSPEC
+{	/* Sums selected mspace_mallinfo() fields over every mspace of p; a null p selects the system pool */
+	struct mallinfo total={0};
+	mstate *ms;
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(ms=p->m; *ms; ms++)
+	{
+		struct mallinfo part=mspace_mallinfo(*ms);
+		total.arena   +=part.arena;
+		total.ordblks +=part.ordblks;
+		total.hblkhd  +=part.hblkhd;
+		total.usmblks +=part.usmblks;
+		total.uordblks+=part.uordblks;
+		total.fordblks+=part.fordblks;
+		total.keepcost+=part.keepcost;
+	}
+	return total;
+}
+#endif
+int    nedpmallopt(nedpool *p, int parno, int value) THROWSPEC
+{	/* p is ignored: the setting is forwarded to mspace_mallopt(), which takes no pool or mspace argument */
+	return mspace_mallopt(parno, value);
+}
+int    nedpmalloc_trim(nedpool *p, size_t pad) THROWSPEC
+{	/* Trims every mspace of p and returns the summed mspace_trim() results */
+	int released=0;
+	mstate *ms;
+	/* A null p selects the system pool, initialising it on first use */
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(ms=p->m; *ms; ms++)
+		released+=mspace_trim(*ms, pad);
+	return released;
+}
+void   nedpmalloc_stats(nedpool *p) THROWSPEC
+{	/* Calls mspace_malloc_stats() on each mspace of p; a null p selects the system pool */
+	mstate *ms;
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(ms=p->m; *ms; ms++)
+	{
+		mspace_malloc_stats(*ms);
+	}
+}
+size_t nedpmalloc_footprint(nedpool *p) THROWSPEC
+{	/* Adds up mspace_footprint() over all mspaces of p; a null p selects the system pool */
+	size_t total=0;
+	mstate *ms;
+	if(!p) { p=&syspool; if(!syspool.threads) InitPool(&syspool, 0, -1); }
+	for(ms=p->m; *ms; ms++)
+	{
+		total+=mspace_footprint(*ms);
+	}
+	return total;
+}
+void **nedpindependent_calloc(nedpool *p, size_t elemsno, size_t elemsize, void **chunks) THROWSPEC
+{	/* Forwards to mspace_independent_calloc() on a locked mspace of p; a null p selects the system pool */
+	void **ret;
+	threadcache *tc;
+	int mymspace;
+	GetThreadCache(&p, &tc, &mymspace, &elemsize);	/* may raise elemsize to sizeof(threadcacheblk) */
+    GETMSPACE(m, p, tc, mymspace, elemsno*elemsize,
+	      ret=mspace_independent_calloc(m, elemsno, elemsize, chunks));
+	return ret;
+}
+void **nedpindependent_comalloc(nedpool *p, size_t elems, size_t *sizes, void **chunks) THROWSPEC
+{	/* Forwards to mspace_independent_comalloc() with each size raised to at least sizeof(threadcacheblk) */
+	void **ret;
+	threadcache *tc;
+	int mymspace;
+	size_t i, *adjustedsizes=(size_t *) alloca(elems*sizeof(size_t));	/* stack copy, so the caller's sizes array is left unmodified */
+	if(!adjustedsizes) return 0;	/* NOTE(review): alloca() presumably never returns 0 and elems is unbounded here - confirm */
+	for(i=0; i<elems; i++)
+		adjustedsizes[i]=sizes[i]<sizeof(threadcacheblk) ? sizeof(threadcacheblk) : sizes[i];
+	GetThreadCache(&p, &tc, &mymspace, 0);
+	GETMSPACE(m, p, tc, mymspace, 0,
+	      ret=mspace_independent_comalloc(m, elems, adjustedsizes, chunks));
+	return ret;
+}
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/lib/nedmalloc/nedmalloc.h b/lib/nedmalloc/nedmalloc.h
new file mode 100644
index 0000000..78145d2
--- /dev/null
+++ b/lib/nedmalloc/nedmalloc.h
@@ -0,0 +1,182 @@
+/* nedalloc, an alternative malloc implementation for multiple threads without
+lock contention based on dlmalloc v2.8.3. (C) 2005 Niall Douglas
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef NEDMALLOC_H
+#define NEDMALLOC_H
+
+
+/* See malloc.c.h for what each function does.
+
+REPLACE_SYSTEM_ALLOCATOR causes nedalloc's functions to be called malloc,
+free etc. instead of nedmalloc, nedfree etc. You may or may not want this.
+
+NO_NED_NAMESPACE prevents the functions from being defined in the nedalloc
+namespace when in C++ (uses the global namespace instead).
+
+EXTSPEC can be defined to be __declspec(dllexport) or
+__attribute__ ((visibility("default"))) or whatever you like. It defaults
+to extern.
+
+USE_LOCKS can be 2 if you want to define your own MLOCK_T, INITIAL_LOCK,
+ACQUIRE_LOCK, RELEASE_LOCK, TRY_LOCK, IS_LOCKED and NULL_LOCK_INITIALIZER.
+
+*/
+
+#include <stddef.h>   /* for size_t */
+
+#ifndef EXTSPEC
+ #define EXTSPEC extern
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER>=1400
+ #define MALLOCATTR __declspec(restrict)
+#endif
+#ifdef __GNUC__
+ #define MALLOCATTR __attribute__ ((malloc))
+#endif
+#ifndef MALLOCATTR
+ #define MALLOCATTR
+#endif
+
+#ifdef REPLACE_SYSTEM_ALLOCATOR
+ #define nedmalloc               malloc
+ #define nedcalloc               calloc
+ #define nedrealloc              realloc
+ #define nedfree                 free
+ #define nedmemalign             memalign
+ #define nedposix_memalign       posix_memalign
+ #define nedmallinfo             mallinfo
+ #define nedmallopt              mallopt
+ #define nedmalloc_trim          malloc_trim
+ #define nedmalloc_stats         malloc_stats
+ #define nedmalloc_footprint     malloc_footprint
+ #define nedindependent_calloc   independent_calloc
+ #define nedindependent_comalloc independent_comalloc
+ #ifdef _MSC_VER
+  #define nedblksize              _msize
+ #endif
+#endif
+
+#ifndef NO_MALLINFO
+#define NO_MALLINFO 0
+#endif
+
+#if !NO_MALLINFO
+struct mallinfo;
+#endif
+
+#if defined(__cplusplus)
+ #if !defined(NO_NED_NAMESPACE)
+namespace nedalloc {
+ #else
+extern "C" {
+ #endif
+ #define THROWSPEC throw()
+#else
+ #define THROWSPEC
+#endif
+
+/* These are the global functions */
+
+/* Gets the usable size of an allocated block. Note this will always be bigger than what was
+asked for due to rounding etc.
+*/
+EXTSPEC size_t nedblksize(void *mem) THROWSPEC;
+
+EXTSPEC void nedsetvalue(void *v) THROWSPEC;
+
+EXTSPEC MALLOCATTR void * nedmalloc(size_t size) THROWSPEC;
+EXTSPEC MALLOCATTR void * nedcalloc(size_t no, size_t size) THROWSPEC;
+EXTSPEC MALLOCATTR void * nedrealloc(void *mem, size_t size) THROWSPEC;
+EXTSPEC void   nedfree(void *mem) THROWSPEC;
+EXTSPEC MALLOCATTR void * nedmemalign(size_t alignment, size_t bytes) THROWSPEC;
+EXTSPEC int nedposix_memalign(void** p, size_t alignment, size_t bytes) THROWSPEC;
+#if !NO_MALLINFO
+EXTSPEC struct mallinfo nedmallinfo(void) THROWSPEC;
+#endif
+EXTSPEC int    nedmallopt(int parno, int value) THROWSPEC;
+EXTSPEC int    nedmalloc_trim(size_t pad) THROWSPEC;
+EXTSPEC void   nedmalloc_stats(void) THROWSPEC;
+EXTSPEC size_t nedmalloc_footprint(void) THROWSPEC;
+EXTSPEC MALLOCATTR void **nedindependent_calloc(size_t elemsno, size_t elemsize, void **chunks) THROWSPEC;
+EXTSPEC MALLOCATTR void **nedindependent_comalloc(size_t elems, size_t *sizes, void **chunks) THROWSPEC;
+
+/* These are the pool functions */
+struct nedpool_t;
+typedef struct nedpool_t nedpool;
+
+/* Creates a memory pool for use with the nedp* functions below.
+Capacity is how much to allocate immediately (if you know you'll be allocating a lot
+of memory very soon) which you can leave at zero. Threads specifies how many threads
+will *normally* be accessing the pool concurrently. Setting this to zero means it
+extends on demand, but be careful of this as it can rapidly consume system resources
+where bursts of concurrent threads use a pool at once.
+*/
+EXTSPEC MALLOCATTR nedpool *nedcreatepool(size_t capacity, int threads) THROWSPEC;
+
+/* Destroys a memory pool previously created by nedcreatepool().
+*/
+EXTSPEC void neddestroypool(nedpool *p) THROWSPEC;
+
+/* Sets a value to be associated with a pool. You can retrieve this value by passing
+any memory block allocated from that pool.
+*/
+EXTSPEC void nedpsetvalue(nedpool *p, void *v) THROWSPEC;
+/* Gets a previously set value using nedpsetvalue() or zero if memory is unknown.
+Optionally can also retrieve pool.
+*/
+EXTSPEC void *nedgetvalue(nedpool **p, void *mem) THROWSPEC;
+
+/* Disables the thread cache for the calling thread, returning any existing cache
+data to the central pool.
+*/
+EXTSPEC void neddisablethreadcache(nedpool *p) THROWSPEC;
+
+EXTSPEC MALLOCATTR void * nedpmalloc(nedpool *p, size_t size) THROWSPEC;
+EXTSPEC MALLOCATTR void * nedpcalloc(nedpool *p, size_t no, size_t size) THROWSPEC;
+EXTSPEC MALLOCATTR void * nedprealloc(nedpool *p, void *mem, size_t size) THROWSPEC;
+EXTSPEC void   nedpfree(nedpool *p, void *mem) THROWSPEC;
+EXTSPEC MALLOCATTR void * nedpmemalign(nedpool *p, size_t alignment, size_t bytes) THROWSPEC;
+#if !NO_MALLINFO
+EXTSPEC struct mallinfo nedpmallinfo(nedpool *p) THROWSPEC;
+#endif
+EXTSPEC int    nedpmallopt(nedpool *p, int parno, int value) THROWSPEC;
+EXTSPEC int    nedpmalloc_trim(nedpool *p, size_t pad) THROWSPEC;
+EXTSPEC void   nedpmalloc_stats(nedpool *p) THROWSPEC;
+EXTSPEC size_t nedpmalloc_footprint(nedpool *p) THROWSPEC;
+EXTSPEC MALLOCATTR void **nedpindependent_calloc(nedpool *p, size_t elemsno, size_t elemsize, void **chunks) THROWSPEC;
+EXTSPEC MALLOCATTR void **nedpindependent_comalloc(nedpool *p, size_t elems, size_t *sizes, void **chunks) THROWSPEC;
+
+#if defined(__cplusplus)
+}
+#endif
+
+#undef MALLOCATTR
+#undef EXTSPEC
+
+#endif
diff --git a/lib/simd/simd.h b/lib/simd/simd.h
index 7ff943b..b81005d 100644
--- a/lib/simd/simd.h
+++ b/lib/simd/simd.h
@@ -53,8 +53,16 @@
 #ifdef NEON
 #include "sse2neon.h"
 #else
+#ifdef WASM
+#include "sse2wasm.h"
+#else
+#ifdef __ALTIVEC__
+#include "sse2altivec.h"
+#else
 #include <xmmintrin.h>
 #endif
+#endif
+#endif
 
 #ifdef AVX512
 #include <zmmintrin.h.h> // AVX512
@@ -287,7 +295,7 @@ typedef __m256 simd_float;
 #ifdef SSE
 uint16_t simd_hmax16(const __m128i buffer);
 uint8_t simd_hmax8(const __m128i buffer);
-#ifndef NEON
+#if !defined(NEON) && !defined(WASM) && !defined(__ALTIVEC__)
 #include <smmintrin.h>  //SSE4.1
 // double support
 #ifndef SIMD_DOUBLE
@@ -397,6 +405,28 @@ typedef __m128i simd_int;
 #endif //SIMD_INT
 #endif //SSE
 
+#if defined(WASM) || defined(__ALTIVEC__)
+template <typename F>
+inline F simd_hmax(const F * in, unsigned int n);
+
+inline uint16_t simd_hmax16(const __m128i buffer) {
+    union {
+        uint16_t as_u16[8];
+        __m128i  as_vec;
+    } t;
+    t.as_vec = buffer;
+    return simd_hmax<uint16_t>((uint16_t*)(t.as_u16), 8);
+}
+
+inline uint8_t simd_hmax8(const __m128i buffer) {
+    union {
+        uint8_t  as_u8[16];
+        __m128i  as_vec;
+    } t;
+    t.as_vec = buffer;
+    return simd_hmax<uint8_t>((uint8_t*)(t.as_u8), 16);
+}
+#else
 #ifdef NEON
 inline uint16_t simd_hmax16(const __m128i buffer) {
     uint16x4_t tmp;
@@ -414,20 +444,6 @@ inline uint8_t simd_hmax8(const __m128i buffer) {
     tmp = vpmax_u8(tmp, tmp);
     return vget_lane_u8(tmp, 0);
 }
-#if 0
-template <typename F>
-inline F simd_hmax(const F * in, unsigned int n);
-
-inline uint16_t simd_hmax16(const __m128i buffer) {
-    SIMDVec* tmp = (SIMDVec*)&buffer;
-    return simd_hmax<uint16_t>((uint16_t*)tmp->m128_u16, 8);
-}
-
-inline uint8_t simd_hmax8(const __m128i buffer) {
-    SIMDVec* tmp = (SIMDVec*)&buffer;
-    return simd_hmax<uint8_t>((uint8_t*)tmp->m128_u8, 16);
-}
-#endif
 #else
 inline uint16_t simd_hmax16(const __m128i buffer)
 {
@@ -444,6 +460,7 @@ inline uint8_t simd_hmax8(const __m128i buffer)
     return (int8_t)(255 -(int8_t) _mm_cvtsi128_si32(tmp3));
 }
 #endif
+#endif
 
 #ifdef AVX2
 inline uint16_t simd_hmax16_avx(const __m256i buffer){
@@ -608,7 +625,7 @@ inline float ScalarProd20(const float* qi, const float* tj) {
 //
 //
 //TODO fix this
-#ifdef SSE
+#if defined(SSE) && !defined(WASM) && !defined(__ALTIVEC__)
     float __attribute__((aligned(16))) res;
     __m128 P; // query 128bit SSE2 register holding 4 floats
     __m128 R;// result
diff --git a/lib/simd/sse2altivec.h b/lib/simd/sse2altivec.h
new file mode 100644
index 0000000..01dee9d
--- /dev/null
+++ b/lib/simd/sse2altivec.h
@@ -0,0 +1,127 @@
+// sse2altivec is still very incomplete
+// licensed under GPLv3 see LICENCE file
+#ifndef SSE2ALTIVEC
+#define SSE2ALTIVEC
+
+// ignore all warnings
+#pragma GCC system_header
+
+#include <altivec.h>
+#define SSE 1
+
+typedef __vector double __m128d;
+typedef __vector float __m128;
+typedef __vector int __m128i;
+
+typedef __vector   signed char simd_s8;
+typedef __vector unsigned char simd_u8;
+typedef __vector   signed short simd_s16;
+typedef __vector unsigned short simd_u16;
+typedef __vector int64_t simd_s64;
+typedef __vector uint64_t simd_u64;
+
+#define _mm_add_ps(x,y)          (__m128)((__m128)(x) + (__m128)(y))
+#define _mm_sub_ps(x,y)          (__m128)((__m128)(x) - (__m128)(y))
+#define _mm_mul_ps(x,y)          (__m128)((__m128)(x) * (__m128)(y))
+#define _mm_div_ps(x,y)          (__m128)((__m128)(x) / (__m128)(y))
+#define _mm_rcp_ps(x)            (__m128)vec_re((__m128)(x))
+#define _mm_max_ps(x,y)          (__m128)vec_max((__m128)(x),(__m128)(y))
+#define _mm_min_ps(x,y)          (__m128)vec_min((__m128)(x),(__m128)(y))
+#define _mm_load_ps(x)           (__m128)vec_vsx_ld(0, (__m128 const*)(x))
+#define _mm_store_ps(x,y)        vec_vsx_st((__m128)(y),0,(__m128*)(x))
+#define _mm_store_ss(x,y)        ((void)(*(float*)(x) = vec_extract((__m128)(y),0))) // store ONLY element 0 (4 bytes), never the whole vector
+#define _mm_set1_ps(x)           (__m128)vec_splats((float)(x))
+#define _mm_setzero_ps(x)        (__m128)vec_splats((float)0)
+#define _mm_cmpgt_ps(x,y)        (__m128)vec_cmpgt((__m128)(x),(__m128)(y))
+#define _mm_cmpeq_ps(x,y)        (__m128)vec_cmpeq((__m128)(x),(__m128)(y))
+#define _mm_cmplt_ps(x,y)        (__m128)vec_cmplt((__m128)(x),(__m128)(y))
+#define _mm_or_ps(x,y)           (__m128)vec_or((__m128)(x),(__m128)(y))
+#define _mm_and_ps(x,y)          (__m128)vec_and((__m128)(x),(__m128)(y))
+#define _mm_andnot_ps(x,y)       (__m128)vec_andc((__m128)(x),(__m128)(y))
+#define _mm_xor_ps(x,y)          (__m128)vec_xor((__m128)(x),(__m128)(y))
+#define _mm_cvtps_epi32(x)       (__m128i)vec_cts(vec_rint((__m128)(x)),0) // round per current mode first (SSE semantics); vec_cts alone truncates
+#define _mm_castps_si128(x)      (__m128i)(x)
+#define _mm_add_epi32(x,y)       (__m128i)vec_add((__m128i)(x),(__m128i)(y))
+#define _mm_add_epi16(x,y)       (__m128i)vec_add((simd_s16)(x),(simd_s16)(y))
+#define _mm_add_epi8(x,y)        (__m128i)vec_add((simd_s8)(x),(simd_s8)(y))
+#define _mm_adds_epi16(x,y)      (__m128i)vec_adds((simd_s16)(x),(simd_s16)(y))
+#define _mm_adds_epu8(x,y)       (__m128i)vec_adds((simd_u8)(x),(simd_u8)(y))
+#define _mm_sub_epi32(x,y)       (__m128i)vec_sub((__m128i)(x),(__m128i)(y))
+#define _mm_sub_epi16(x,y)       (__m128i)vec_sub((simd_s16)(x),(simd_s16)(y))
+#define _mm_sub_epi8(x,y)        (__m128i)vec_sub((simd_s8)(x),(simd_s8)(y))
+#define _mm_subs_epu16(x,y)      (__m128i)vec_subs((simd_u16)(x),(simd_u16)(y))
+#define _mm_subs_epu8(x,y)       (__m128i)vec_subs((simd_u8)(x),(simd_u8)(y))
+#define _mm_mullo_epi32(x,y)     (__m128i)vec_mul((__m128i)(x),(__m128i)(y))
+#define _mm_max_epi32(x,y)       (__m128i)vec_max((__m128i)(x),(__m128i)(y))
+#define _mm_max_epi16(x,y)       (__m128i)vec_max((simd_s16)(x),(simd_s16)(y))
+#define _mm_max_epu8(x,y)        (__m128i)vec_max((simd_u8)(x),(simd_u8)(y))
+#define _mm_min_epu8(x,y)        (__m128i)vec_min((simd_u8)(x),(simd_u8)(y))
+#define _mm_load_si128(x)        (__m128i)vec_vsx_ld(0,(__m128i const*)(x))
+#define _mm_loadu_si128(x)       (__m128i)vec_vsx_ld(0,(__m128i const*)(x))
+#define _mm_storeu_si128(x,y)    vec_vsx_st((__m128i)(y),0,(__m128i*)(x))
+#define _mm_store_si128(x,y)     vec_vsx_st((__m128i)(y),0,(__m128i*)(x))
+#define _mm_set1_epi32(x)        (__m128i)vec_splats((signed int)(x))
+#define _mm_set1_epi16(x)        (__m128i)vec_splats((signed short)(x))
+#define _mm_set1_epi8(x)         (__m128i)vec_splats((signed char)(x))
+#define _mm_setzero_si128(x)     (__m128i)vec_splats(0)
+#define _mm_cmpgt_epi32(x,y)     (__m128i)vec_cmpgt((__m128i)(x),(__m128i)(y))
+#define _mm_cmpgt_epi16(x,y)     (__m128i)vec_cmpgt((simd_s16)(x),(simd_s16)(y))
+#define _mm_cmpgt_epi8(x,y)      (__m128i)vec_cmpgt((simd_s8)(x),(simd_s8)(y))
+#define _mm_cmpeq_epi32(x,y)     (__m128i)vec_cmpeq((__m128i)(x),(__m128i)(y))
+#define _mm_cmpeq_epi16(x,y)     (__m128i)vec_cmpeq((simd_s16)(x),(simd_s16)(y))
+#define _mm_cmpeq_epi8(x,y)      (__m128i)vec_cmpeq((simd_s8)(x),(simd_s8)(y))
+#define _mm_cmplt_epi32(x,y)     (__m128i)vec_cmplt((__m128i)(x),(__m128i)(y))
+#define _mm_cmplt_epi16(x,y)     (__m128i)vec_cmplt((simd_s16)(x),(simd_s16)(y))
+#define _mm_cmplt_epi8(x,y)      (__m128i)vec_cmplt((simd_s8)(x),(simd_s8)(y))
+#define _mm_or_si128(x,y)        (__m128i)vec_or((__m128i)(x),(__m128i)(y))
+#define _mm_and_si128(x,y)       (__m128i)vec_and((__m128i)(x),(__m128i)(y))
+#define _mm_andnot_si128(x,y)    (__m128i)vec_andc((__m128i)(x),(__m128i)(y))
+#define _mm_xor_si128(x,y)       (__m128i)vec_xor((__m128i)(x),(__m128i)(y))
+#define _mm_extract_epi16(x,imm) ((int)(unsigned short)vec_extract((simd_u16)(x),(imm))) // zero-extended to int, as SSE2 pextrw does
+#define _mm_extract_epi8(x,imm)  ((int)(unsigned char)vec_extract((simd_u8)(x),(imm))) // zero-extended to int, as SSE4.1 pextrb does
+#define _mm_slli_epi16(x,y)      (__m128i)vec_sl((simd_s16)(x),vec_splats((unsigned short)(y))) // result typed __m128i like the sibling macros
+#define _mm_srli_epi16(x,y)      (__m128i)vec_sr((simd_s16)(x),vec_splats((unsigned short)(y))) // result typed __m128i like the sibling macros
+#define _mm_slli_epi32(x,y)      (__m128i)vec_sl((__m128i)(x),vec_splats((unsigned int)(y)))
+#define _mm_srli_epi32(x,y)      (__m128i)vec_sr((__m128i)(x),vec_splats((unsigned int)(y)))
+#define _mm_cvtepi32_ps(x)       (__m128)vec_ctf((__m128i)(x),0)
+#define _mm_castsi128_ps(x)      (__m128)(x)
+#define _mm_slli_si128(x,y)      (__m128i)vec_slo((simd_u8)(x),(simd_u8)vec_splats((char)((y) << 3))) // (y) << 3: byte count scaled by 8
+#define _mm_srli_si128(x,y)      (__m128i)vec_sro((simd_u8)(x),(simd_u8)vec_splats((char)((y) << 3))) // (y) << 3: byte count scaled by 8
+#define _mm_cvtsi128_si64(a)     (int64_t)vec_extract((simd_s64)(a),0)
+#define _mm_cvtsi128_si32(a)     (int32_t)vec_extract((__m128i)(a),0)
+#define _mm_cvtsi64_si128(a)     (__m128i)((simd_s64){(int64_t)(a),0})
+#define _mm_cvtsi32_si128(a)     (__m128i){(int)(a),0,0,0}
+#define _mm_packs_epi32(x,y)     (__m128i)vec_packs((__m128i)(x), (__m128i)(y)) // result typed __m128i like the sibling macros
+#define _mm_packus_epi16(x,y)    (__m128i)vec_packsu((simd_s16)(x), (simd_s16)(y)) // result typed __m128i like the sibling macros
+#define _mm_set_epi32(e3,e2,e1,e0)  (__m128i){(e0),(e1),(e2),(e3)}
+#define _mm_setr_epi32(e3,e2,e1,e0) (__m128i){(e3),(e2),(e1),(e0)}
+#define _mm_set_epi16(e7,e6,e5,e4,e3,e2,e1,e0) \
+                                    (__m128i)((simd_s16){(e0),(e1),(e2),(e3),(e4),(e5),(e6),(e7)})
+#define _mm_setr_epi16(e7,e6,e5,e4,e3,e2,e1,e0) \
+                                    (__m128i)((simd_s16){(e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)})
+#define _mm_set_epi8(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0) \
+                                    (__m128i)((simd_s8){(e0),(e1),(e2),(e3),(e4),(e5),(e6),(e7),(e8),(e9),(e10),(e11),(e12),(e13),(e14),(e15)})
+#define _mm_setr_epi8(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0) \
+                                    (__m128i)((simd_s8){(e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)})
+
+static inline void _mm_storel_epi64(__m128i* mem_addr, __m128i a) {
+    *((int64_t*)mem_addr) = (int64_t)vec_extract((simd_s64)(a), 0); // writes only 64-bit element 0 (8 bytes) of a; the rest of *mem_addr is not touched
+}
+
+// From OpenCV
+// https://github.com/opencv/opencv/pull/15235
+// 3-Clause BSD License
+static inline unsigned short _mm_movemask_epi8(__m128i value) {
+    static const simd_u8 perm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0}; // 8*i for i = 15..0; presumably the bit index of each byte's top bit -- confirm against vec_vbpermq docs
+    return vec_extract((__m128i)vec_vbpermq((simd_u8)value, perm), 2); // result is read from 32-bit element 2 -- NOTE(review): looks tied to little-endian element numbering; confirm
+}
+
+// From reedsolomon
+// https://github.com/NicolasT/reedsolomon/blob/master/cbits/reedsolomon.c
+// MIT License
+static inline __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {
+  const simd_s8 zero = vec_splats((signed char)0); // pshufb contract: control byte with bit 7 set -> 0, otherwise -> a[ctrl & 15]
+  return (__m128i)vec_sel(vec_perm((simd_s8)a, (simd_s8)a, (simd_u8)b), zero, vec_cmplt((simd_s8)b, zero));
+}
+
+#endif
diff --git a/lib/simd/sse2neon.h b/lib/simd/sse2neon.h
index 7734063..9126b38 100644
--- a/lib/simd/sse2neon.h
+++ b/lib/simd/sse2neon.h
@@ -1,1753 +1,3532 @@
 #ifndef SSE2NEON_H
 #define SSE2NEON_H
 
+// ignore all warnings
+#pragma GCC system_header
+
 // This header file provides a simple API translation layer
-// between SSE intrinsics to their corresponding ARM NEON versions
-//
-// This header file does not (yet) translate *all* of the SSE intrinsics.
-// Since this is in support of a specific porting effort, I have only
-// included the intrinsics I needed to get my port to work.
-//
-// Questions/Comments/Feedback send to: jratcliffscarab@gmail.com
-//
-// If you want to improve or add to this project, send me an
-// email and I will probably approve your access to the depot.
-//
-// Project is located here:
-//
-//	https://github.com/jratcliff63367/sse2neon
-//
-// Show your appreciation for open source by sending me a bitcoin tip to the following
-// address.
-//
-// TipJar: 1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p :
-// https://blockchain.info/address/1PzgWDSyq4pmdAXRH8SPUtta4SWGrt4B1p
-//
-//
-// Contributors to this project are:
-//
-// John W. Ratcliff     : jratcliffscarab@gmail.com
-// Brandon Rowlett      : browlett@nvidia.com
-// Ken Fast             : kfast@gdeb.com
-// Eric van Beurden     : evanbeurden@nvidia.com
-// Alexander Potylitsin : apotylitsin@nvidia.com
-//
-//
-// *********************************************************************************************************************
-// apoty: March 17, 2017
-// Current version was changed in most to fix issues and potential issues.
-// All unit tests were rewritten as a part of forge lib project to cover all implemented functions.
-// *********************************************************************************************************************
-// Release notes for January 20, 2017 version:
-//
-// The unit tests have been refactored.  They no longer assert on an error, instead they return a pass/fail condition
-// The unit-tests now test 10,000 random float and int values against each intrinsic.
+// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions
 //
-// SSE2NEON now supports 95 SSE intrinsics.  39 of them have formal unit tests which have been implemented and
-// fully tested on NEON/ARM.  The remaining 56 still need unit tests implemented.
+// This header file does not yet translate all of the SSE intrinsics.
 //
-// A struct is now defined in this header file called 'SIMDVec' which can be used by applications which
-// attempt to access the contents of an _m128 struct directly.  It is important to note that accessing the __m128
-// struct directly is bad coding practice by Microsoft: @see: https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
-//
-// However, some legacy source code may try to access the contents of an __m128 struct directly so the developer
-// can use the SIMDVec as an alias for it.  Any casting must be done manually by the developer, as you cannot
-// cast or otherwise alias the base NEON data type for intrinsic operations.
-//
-// A bug was found with the _mm_shuffle_ps intrinsic.  If the shuffle permutation was not one of the ones with
-// a custom/unique implementation causing it to fall through to the default shuffle implementation it was failing
-// to return the correct value.  This is now fixed.
-//
-// A bug was found with the _mm_cvtps_epi32 intrinsic.  This converts floating point values to integers.
-// It was not honoring the correct rounding mode.  In SSE the default rounding mode when converting from float to int
-// is to use 'round to even' otherwise known as 'bankers rounding'.  ARMv7 did not support this feature but ARMv8 does.
-// As it stands today, this header file assumes ARMv8.  If you are trying to target really old ARM devices, you may get
-// a build error.
-//
-// Support for a number of new intrinsics was added, however, none of them yet have unit-tests to 100% confirm they are
-// producing the correct results on NEON.  These unit tests will be added as soon as possible.
-//
-// Here is the list of new instrinsics which have been added:
-//
-// _mm_cvtss_f32     :  extracts the lower order floating point value from the parameter
-// _mm_add_ss        : adds the scalar single - precision floating point values of a and b
-// _mm_div_ps        : Divides the four single - precision, floating - point values of a and b.
-// _mm_div_ss        : Divides the scalar single - precision floating point value of a by b.
-// _mm_sqrt_ss       : Computes the approximation of the square root of the scalar single - precision floating point value of in.
-// _mm_rsqrt_ps      : Computes the approximations of the reciprocal square roots of the four single - precision floating point values of in.
-// _mm_comilt_ss     : Compares the lower single - precision floating point scalar values of a and b using a less than operation
-// _mm_comigt_ss     : Compares the lower single - precision floating point scalar values of a and b using a greater than operation.
-// _mm_comile_ss     :  Compares the lower single - precision floating point scalar values of a and b using a less than or equal operation.
-// _mm_comige_ss     : Compares the lower single - precision floating point scalar values of a and b using a greater than or equal operation.
-// _mm_comieq_ss     :  Compares the lower single - precision floating point scalar values of a and b using an equality operation.
-// _mm_comineq_s     :  Compares the lower single - precision floating point scalar values of a and b using an inequality operation
-// _mm_unpackhi_epi8 : Interleaves the upper 8 signed or unsigned 8 - bit integers in a with the upper 8 signed or unsigned 8 - bit integers in b.
-// _mm_unpackhi_epi16:  Interleaves the upper 4 signed or unsigned 16 - bit integers in a with the upper 4 signed or unsigned 16 - bit integers in b.
-//
-// *********************************************************************************************************************
+// Contributors to this work are:
+//   John W. Ratcliff <jratcliffscarab@gmail.com>
+//   Brandon Rowlett <browlett@nvidia.com>
+//   Ken Fast <kfast@gdeb.com>
+//   Eric van Beurden <evanbeurden@nvidia.com>
+//   Alexander Potylitsin <apotylitsin@nvidia.com>
+//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
+//   Jim Huang <jserv@biilabs.io>
+//   Mark Cheng <marktwtn@biilabs.io>
+//   Malcolm James MacLeod <malcolm@gulden.com>
+//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+
 /*
-** The MIT license:
-**
-** Permission is hereby granted, free of charge, to any person obtaining a copy
-** of this software and associated documentation files (the "Software"), to deal
-** in the Software without restriction, including without limitation the rights
-** to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-** copies of the Software, and to permit persons to whom the Software is furnished
-** to do so, subject to the following conditions:
-**
-** The above copyright notice and this permission notice shall be included in all
-** copies or substantial portions of the Software.
-
-** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-** IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-** AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-** WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-** CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#define ENABLE_CPP_VERSION 0
+ * The MIT license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 
 #if defined(__GNUC__) || defined(__clang__)
-#	pragma push_macro("FORCE_INLINE")
-#	pragma push_macro("ALIGN_STRUCT")
-#	define FORCE_INLINE       static inline __attribute__((always_inline))
-#	define ALIGN_STRUCT(x)    __attribute__((aligned(x)))
-#   define EXTENSION		  __extension__
+
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+
 #else
-#	error "Macro name collisions may happens with unknown compiler"
-#	define FORCE_INLINE       static inline
-#	define ALIGN_STRUCT(x)    __declspec(align(x))
-#   define EXTENSION
+
+#error "Macro name collisions may happens with unknown compiler"
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+#define FORCE_INLINE static inline
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+
 #endif
 
 #include <stdint.h>
-#include "arm_neon.h"
-
-
-/*******************************************************/
-/* MACRO for shuffle parameter for _mm_shuffle_ps().   */
-/* Argument fp3 is a digit[0123] that represents the fp*/
-/* from argument "b" of mm_shuffle_ps that will be     */
-/* placed in fp3 of result. fp2 is the same for fp2 in */
-/* result. fp1 is a digit[0123] that represents the fp */
-/* from argument "a" of mm_shuffle_ps that will be     */
-/* places in fp1 of result. fp0 is the same for fp0 of */
-/* result                                              */
-/*******************************************************/
-#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
-	(((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+#include <arm_neon.h>
+
+/**
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
 
 /* indicate immediate constant argument in a given range */
-#define __constrange(a,b) \
-	const
+#define __constrange(a, b) const
 
+typedef float32x2_t __m64;
 typedef float32x4_t __m128;
-typedef int32x4_t __m128i;
-
+typedef int64x2_t __m128i;
 
 // ******************************************
 // type-safe casting between types
 // ******************************************
 
-#define vreinterpretq_m128_f16(x) \
-	vreinterpretq_f32_f16(x)
-
-#define vreinterpretq_m128_f32(x) \
-	(x)
-
-#define vreinterpretq_m128_f64(x) \
-	vreinterpretq_f32_f64(x)
-
-
-#define vreinterpretq_m128_u8(x) \
-	vreinterpretq_f32_u8(x)
-
-#define vreinterpretq_m128_u16(x) \
-	vreinterpretq_f32_u16(x)
-
-#define vreinterpretq_m128_u32(x) \
-	vreinterpretq_f32_u32(x)
-
-#define vreinterpretq_m128_u64(x) \
-	vreinterpretq_f32_u64(x)
-
-
-#define vreinterpretq_m128_s8(x) \
-	vreinterpretq_f32_s8(x)
-
-#define vreinterpretq_m128_s16(x) \
-	vreinterpretq_f32_s16(x)
-
-#define vreinterpretq_m128_s32(x) \
-	vreinterpretq_f32_s32(x)
-
-#define vreinterpretq_m128_s64(x) \
-	vreinterpretq_f32_s64(x)
-
-
-#define vreinterpretq_f16_m128(x) \
-	vreinterpretq_f16_f32(x)
-
-#define vreinterpretq_f32_m128(x) \
-	(x)
-
-#define vreinterpretq_f64_m128(x) \
-	vreinterpretq_f64_f32(x)
-
-
-#define vreinterpretq_u8_m128(x) \
-	vreinterpretq_u8_f32(x)
-
-#define vreinterpretq_u16_m128(x) \
-	vreinterpretq_u16_f32(x)
-
-#define vreinterpretq_u32_m128(x) \
-	vreinterpretq_u32_f32(x)
-
-#define vreinterpretq_u64_m128(x) \
-	vreinterpretq_u64_f32(x)
-
-
-#define vreinterpretq_s8_m128(x) \
-	vreinterpretq_s8_f32(x)
-
-#define vreinterpretq_s16_m128(x) \
-	vreinterpretq_s16_f32(x)
-
-#define vreinterpretq_s32_m128(x) \
-	vreinterpretq_s32_f32(x)
-
-#define vreinterpretq_s64_m128(x) \
-	vreinterpretq_s64_f32(x)
-
-
-#define vreinterpretq_m128i_s8(x) \
-	vreinterpretq_s32_s8(x)
-
-#define vreinterpretq_m128i_s16(x) \
-	vreinterpretq_s32_s16(x)
-
-#define vreinterpretq_m128i_s32(x) \
-	(x)
-
-#define vreinterpretq_m128i_s64(x) \
-	vreinterpretq_s32_s64(x)
-
-
-#define vreinterpretq_m128i_u8(x) \
-	vreinterpretq_s32_u8(x)
-
-#define vreinterpretq_m128i_u16(x) \
-	vreinterpretq_s32_u16(x)
-
-#define vreinterpretq_m128i_u32(x) \
-	vreinterpretq_s32_u32(x)
-
-#define vreinterpretq_m128i_u64(x) \
-	vreinterpretq_s32_u64(x)
-
-
-#define vreinterpretq_s8_m128i(x) \
-	vreinterpretq_s8_s32(x)
-
-#define vreinterpretq_s16_m128i(x) \
-	vreinterpretq_s16_s32(x)
-
-#define vreinterpretq_s32_m128i(x) \
-	(x)
-
-#define vreinterpretq_s64_m128i(x) \
-	vreinterpretq_s64_s32(x)
-
-
-#define vreinterpretq_u8_m128i(x) \
-	vreinterpretq_u8_s32(x)
-
-#define vreinterpretq_u16_m128i(x) \
-	vreinterpretq_u16_s32(x)
+#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
+#define vreinterpretq_m128_f32(x) (x)
+#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
+
+#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
+#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
+#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
+#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
+
+#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
+#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
+#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
+#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
+
+#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
+#define vreinterpretq_f32_m128(x) (x)
+#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
+
+#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
+#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
+#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
+#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
+
+#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
+#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
+#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
+#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
+
+#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
+#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
+#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
+#define vreinterpretq_m128i_s64(x) (x)
+
+#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
+#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
+#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
+#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
+
+#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
+#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
+#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
+#define vreinterpretq_s64_m128i(x) (x)
+
+#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
+#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
+#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
+#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an _m128 struct
+// directly.  It is important to note that accessing the __m128 struct directly
+// is considered bad coding practice by Microsoft; see:
+// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it.  Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base NEON data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides.  This union should really only be used when
+// trying to access the members of the vector as integer values.  GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+//
+// Ideally direct accesses to SIMD vectors should not be used since it can cause
+// a performance hit.  If it really is needed however, the original __m128
+// variable can be aliased with a pointer to this union and used to access
+// individual components.  The use of this union should be hidden behind a macro
+// that is used throughout the codebase to access the members instead of always
+// declaring this type of variable.
+typedef union ALIGN_STRUCT(16) SIMDVec {
+    float m128_f32[4];     // as floats - do not use this.  Added for convenience.
+    int8_t m128_i8[16];    // as signed 8-bit integers.
+    int16_t m128_i16[8];   // as signed 16-bit integers.
+    int32_t m128_i32[4];   // as signed 32-bit integers.
+    int64_t m128_i64[2];   // as signed 64-bit integers.
+    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
+    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
+    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
+    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
+} SIMDVec;
 
-#define vreinterpretq_u32_m128i(x) \
-	vreinterpretq_u32_s32(x)
+// casting using SIMDVec
+#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &(x))->m128_u64[(n)])  // x must be an lvalue; args parenthesized for macro hygiene
+#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &(x))->m128_u32[(n)])  // x must be an lvalue; args parenthesized for macro hygiene
 
-#define vreinterpretq_u64_m128i(x) \
-	vreinterpretq_u64_s32(x)
 
+// ******************************************
+// Backwards compatibility for compilers with lack of specific type support
+// ******************************************
 
-// union intended to allow direct access to an __m128 variable using the names that the MSVC
-// compiler provides.  This union should really only be used when trying to access the members
-// of the vector as integer values.  GCC/clang allow native access to the float members through
-// a simple array access operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally direct accesses to SIMD vectors should not be used since it can cause a performance
-// hit.  If it really is needed however, the original __m128 variable can be aliased with a
-// pointer to this union and used to access individual components.  The use of this union should
-// be hidden behind a macro that is used throughout the codebase to access the members instead
-// of always declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec
+// Older gcc does not define vld1q_u8_x4 type
+#if defined(__GNUC__) && !defined(__clang__)
+#if __GNUC__ < 9 || (__GNUC__ == 9 && (__GNUC_MINOR__ <= 2))
+FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
 {
-	float       m128_f32[4];    // as floats - do not to use this.  Added for convenience.
-	int8_t      m128_i8[16];    // as signed 8-bit integers.
-	int16_t     m128_i16[8];    // as signed 16-bit integers.
-	int32_t     m128_i32[4];    // as signed 32-bit integers.
-	int64_t     m128_i64[2];    // as signed 64-bit integers.
-	uint8_t     m128_u8[16];    // as unsigned 8-bit integers.
-	uint16_t    m128_u16[8];    // as unsigned 16-bit integers.
-	uint32_t    m128_u32[4];    // as unsigned 32-bit integers.
-	uint64_t    m128_u64[2];    // as unsigned 64-bit integers.
-} SIMDVec;
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+}
+#endif
+#endif
 
 
 // ******************************************
 // Set/get methods
 // ******************************************
 
-// extracts the lower order floating point value from the parameter : https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
+// Loads one cache line of data from address p to a location closer to the
+// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
+FORCE_INLINE void _mm_prefetch(const void *p, int i)
+{
+    __builtin_prefetch(p);
+    (void) i;  // the hint argument is accepted but not forwarded to the builtin
+}
+
+// extracts the lower order floating point value from the parameter :
+// https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
 FORCE_INLINE float _mm_cvtss_f32(__m128 a)
 {
-	return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
 }
 
-// Sets the 128-bit value to zero https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_setzero_si128()
+// Sets the 128-bit value to zero
+// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_setzero_si128(void)
 {
-	return vreinterpretq_m128i_s32(vdupq_n_s32(0));
+    return vreinterpretq_m128i_s32(vdupq_n_s32(0));
 }
 
-// Clears the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
+// Clears the four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_setzero_ps(void)
 {
-	return vreinterpretq_m128_f32(vdupq_n_f32(0));
+    return vreinterpretq_m128_f32(vdupq_n_f32(0));
 }
 
-// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+// Sets the four single-precision, floating-point values to w.
+//
+//   r0 := r1 := r2 := r3 := w
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_set1_ps(float _w)
 {
-	return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
 }
 
-// Sets the four single-precision, floating-point values to w. https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
+// Sets the four single-precision, floating-point values to w.
+// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_set_ps1(float _w)
 {
-	return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
 }
 
-// Sets the four single-precision, floating-point values to the four inputs. https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
+// Sets the four single-precision, floating-point values to the four inputs.
+// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
 {
-	float __attribute__((aligned(16))) data[4] = { x, y, z, w };
-	return vreinterpretq_m128_f32(vld1q_f32(data));
+    float __attribute__((aligned(16))) data[4] = {x, y, z, w};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
 }
 
-// Sets the four single-precision, floating-point values to the four inputs in reverse order. https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z , float y , float x )
+// Sets the four single-precision, floating-point values to the four inputs in
+// reverse order.
+// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
 {
-	float __attribute__ ((aligned (16))) data[4] = { w, z, y, x };
-	return vreinterpretq_m128_f32(vld1q_f32(data));
+    float __attribute__((aligned(16))) data[4] = {w, z, y, x};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
 }
 
-
-//added by hasindu
-//Sets the 4 signed 32-bit integer values in reverse order https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
+// Sets the 8 signed 16-bit integer values in reverse order.
+//
+// Return Value
+//   r0 := w0
+//   r1 := w1
+//   ...
+//   r7 := w7
+FORCE_INLINE __m128i _mm_setr_epi16(short w0,
+                                    short w1,
+                                    short w2,
+                                    short w3,
+                                    short w4,
+                                    short w5,
+                                    short w6,
+                                    short w7)
+{
+    int16_t __attribute__((aligned(16)))
+            data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
+    return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
+}
+
+// Sets the 4 signed 32-bit integer values in reverse order
+// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
 FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
 {
-	int32_t __attribute__((aligned(16))) data[4] = { i3, i2, i1, i0 };
-	return vreinterpretq_m128i_s32(vld1q_s32(data));
+    int32_t __attribute__((aligned(16))) data[4] = {i3, i2, i1, i0};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
 }
 
-//following added by hasindu
-//Sets the 16 signed 8-bit integer values to b.https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi8(char w)
+// Sets the 16 signed 8-bit integer values to b.
+//
+//   r0 := b
+//   r1 := b
+//   ...
+//   r15 := b
+//
+// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
 {
-	return vreinterpretq_m128i_s8(vdupq_n_s8(w));
+    return vreinterpretq_m128i_s8(vdupq_n_s8(w));
 }
 
-
-//following added by hasindu
-//Sets the 8 signed 16-bit integer values to w. https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
+// Sets the 8 signed 16-bit integer values to w.
+//
+//   r0 := w
+//   r1 := w
+//   ...
+//   r7 := w
+//
+// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
 FORCE_INLINE __m128i _mm_set1_epi16(short w)
 {
-	return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+    return vreinterpretq_m128i_s16(vdupq_n_s16(w));
+}
+
+// Sets the 16 signed 8-bit integer values; b0 lands in lane 0, b15 in lane 15.
+// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t __attribute__((aligned(16)))
+            data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                        (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                        (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return vreinterpretq_m128i_s8(vld1q_s8(data));
+}
+
+// Sets the 8 signed 16-bit integer values.
+// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t __attribute__((aligned(16)))
+            data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Sets the 16 signed 8-bit integer values in reverse order (b0 is lane 0).
+// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+                                   signed char b1,
+                                   signed char b2,
+                                   signed char b3,
+                                   signed char b4,
+                                   signed char b5,
+                                   signed char b6,
+                                   signed char b7,
+                                   signed char b8,
+                                   signed char b9,
+                                   signed char b10,
+                                   signed char b11,
+                                   signed char b12,
+                                   signed char b13,
+                                   signed char b14,
+                                   signed char b15)
+{
+    int8_t __attribute__((aligned(16)))
+            data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                        (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                        (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                        (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return vreinterpretq_m128i_s8(vld1q_s8(data));
+}
+
+// Sets the 4 signed 32-bit integer values to i.
+//
+//   r0 := i
+//   r1 := i
+//   r2 := i
+//   r3 := i
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
 }
 
-//following added by hasindu
-//Sets the 8 signed 16-bit integer values. https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
+// Sets the 2 signed 64-bit integer values to i.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
+FORCE_INLINE __m128i _mm_set1_epi64(int64_t _i)
 {
-	int16_t __attribute__((aligned(16))) data[8] = { i0, i1, i2, i3, i4, i5, i6, i7 };
-	return vreinterpretq_m128i_s16(vld1q_s16(data));
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
 }
 
-
-// Sets the 4 signed 32-bit integer values to i. https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+// Sets the 2 signed 64-bit integer values to i.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x&expand=4961
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
 {
-	return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
 }
 
-// Sets the 4 signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+// Sets the 4 signed 32-bit integer values.
+// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
 {
-	int32_t __attribute__((aligned(16))) data[4] = { i0, i1, i2, i3 };
-	return vreinterpretq_m128i_s32(vld1q_s32(data));
+    int32_t __attribute__((aligned(16))) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    int64_t __attribute__((aligned(16))) data[2] = {i2, i1};
+    return vreinterpretq_m128i_s64(vld1q_s64(data));
 }
 
-// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
 FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
 {
-	vst1q_f32(p, vreinterpretq_f32_m128(a));
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
 }
 
-// Stores four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
 FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
 {
-	vst1q_f32(p, vreinterpretq_f32_m128(a));
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
 }
 
-// Stores four 32-bit integer values as (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+// Stores four 32-bit integer values (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
 FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
 {
-	vst1q_s32((int32_t*) p, vreinterpretq_s32_m128i(a));
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
 }
 
-//added by hasindu (verify this for requirement of alignment)
-// Stores four 32-bit integer values as (as a __m128i value) at the address p. https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+// Stores four 32-bit integer values (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
 {
-	vst1q_s32((int32_t*) p, vreinterpretq_s32_m128i(a));
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
 }
 
-// Stores the lower single - precision, floating - point value. https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
+// Stores the lower single-precision, floating-point value.
+// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
 FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
 {
-	vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
 }
 
-// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.  https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_storel_epi64(__m128i* a, __m128i b)
+// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
+// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
 {
-	uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
-	uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
-	*a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
+    // Store only the low 64 bits of b; the 8 bytes above a are never accessed.
+    vst1_u64((uint64_t *) a,
+             vget_low_u64(vreinterpretq_u64_m128i(b)));
 }
 
-// Loads a single single-precision, floating-point value, copying it into all four words https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load1_ps(const float * p)
+// Stores the lower two single-precision floating point values of a to the
+// address p.
+//
+//   *p0 := a0
+//   *p1 := a1
+//
+// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
 {
-	return vreinterpretq_m128_f32(vld1q_dup_f32(p));
+    *p = vget_low_f32(a);
 }
 
-// Loads four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load_ps(const float * p)
+// Loads a single single-precision, floating-point value, copying it into all
+// four words
+// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load1_ps(const float *p)
 {
-	return vreinterpretq_m128_f32(vld1q_f32(p));
+    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
 }
+#define _mm_load_ps1 _mm_load1_ps
 
-// Loads four single-precision, floating-point values.  https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_loadu_ps(const float * p)
+// Sets the lower two single-precision, floating-point values with 64
+// bits of data loaded from the address p; the upper two values are passed
+// through from a.
+//
+// Return Value
+//   r0 := *p0
+//   r1 := *p1
+//   r2 := a2
+//   r3 := a3
+//
+// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *b)
+{
+    return vreinterpretq_m128_f32(
+            vcombine_f32(vld1_f32((const float32_t *) b), vget_high_f32(a)));
+}
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Loads four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
 {
-	// for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are equivalent for neon
-	return vreinterpretq_m128_f32(vld1q_f32(p));
+    // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
+    // equivalent for neon
+    return vreinterpretq_m128_f32(vld1q_f32(p));
 }
 
-// Loads an single - precision, floating - point value into the low word and clears the upper three words.  https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_load_ss(const float * p)
+// Loads a single-precision, floating-point value into the low word and
+// clears the upper three words.
+// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
 {
-	return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
 }
 
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
 
 // ******************************************
 // Logic/Binary operations
 // ******************************************
 
-// Compares for inequality.  https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
+// Compares for inequality.
+// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_u32( vmvnq_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)) ) );
+    return vreinterpretq_m128_u32(vmvnq_u32(
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
 }
 
-// Computes the bitwise AND-NOT of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
+// Computes the bitwise AND-NOT of the four single-precision, floating-point
+// values of a and b.
+//
+//   r0 := ~a0 & b0
+//   r1 := ~a1 & b1
+//   r2 := ~a2 & b2
+//   r3 := ~a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_s32( vbicq_s32(vreinterpretq_s32_m128(b), vreinterpretq_s32_m128(a)) ); // *NOTE* argument swap
+    return vreinterpretq_m128_s32(
+            vbicq_s32(vreinterpretq_s32_m128(b),
+                      vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
 }
 
-// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the 128-bit value in a. https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
+// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
+// 128-bit value in a.
+//
+//   r := (~a) & b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s32( vbicq_s32(vreinterpretq_s32_m128i(b), vreinterpretq_s32_m128i(a)) ); // *NOTE* argument swap
+    return vreinterpretq_m128i_s32(
+            vbicq_s32(vreinterpretq_s32_m128i(b),
+                      vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
 }
 
-// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
+// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
+// b.
+//
+//   r := a & b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s32( vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
+    return vreinterpretq_m128i_s32(
+            vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Computes the bitwise AND of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
+// Computes the bitwise AND of the four single-precision, floating-point values
+// of a and b.
+//
+//   r0 := a0 & b0
+//   r1 := a1 & b1
+//   r2 := a2 & b2
+//   r3 := a3 & b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_s32( vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
+    return vreinterpretq_m128_s32(
+            vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
 }
 
-// Computes the bitwise OR of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
+// Computes the bitwise OR of the four single-precision, floating-point values
+// of a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_s32( vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
+    return vreinterpretq_m128_s32(
+            vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
 }
 
-// Computes bitwise EXOR (exclusive-or) of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
+// Computes bitwise EXOR (exclusive-or) of the four single-precision,
+// floating-point values of a and b.
+// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_s32( veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)) );
+    return vreinterpretq_m128_s32(
+            veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
 }
 
-// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
+// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
+//
+//   r := a | b
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s32( vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
+    return vreinterpretq_m128i_s32(
+            vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
+// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
+// b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s32( veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)) );
+    return vreinterpretq_m128i_s32(
+            veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// NEON does not provide this method
-// Creates a 4-bit mask from the most significant bits of the four single-precision, floating-point values. https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
+// Moves the upper two values of B into the lower two values of A.
+//
+//   r3 := a3
+//   r2 := a2
+//   r1 := b3
+//   r0 := b2
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
 {
-#if ENABLE_CPP_VERSION // I am not yet convinced that the NEON version is faster than the C version of this
-	uint32x4_t &ia = *(uint32x4_t *)&a;
-	return (ia[0] >> 31) | ((ia[1] >> 30) & 2) | ((ia[2] >> 29) & 4) | ((ia[3] >> 28) & 8);
-#else
-	static const uint32x4_t movemask = { 1, 2, 4, 8 };
-	static const uint32x4_t highbit = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
-	uint32x4_t t0 = vreinterpretq_u32_m128(a);
-	uint32x4_t t1 = vtstq_u32(t0, highbit);
-	uint32x4_t t2 = vandq_u32(t1, movemask);
-	uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
-	return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
-#endif
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+}
+
+// Moves the lower two values of B into the upper two values of A.
+//
+//   r3 := b1
+//   r2 := b0
+//   r1 := a1
+//   r0 := a0
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
+}
+
+FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
+{
+    return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
+}
+
+FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
+{
+    return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
 }
 
 // Takes the upper 64 bits of a and places it in the low end of the result
 // Takes the lower 64 bits of b and places it into the high end of the result.
 FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
 {
-	float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
 }
 
-// takes the lower two 32-bit values from a and swaps them and places in high end of result
-// takes the higher two 32 bit values from b and swaps them and places in low end of result.
+// Takes the lower two 32-bit values from a, swaps them and places them in the
+// low end of the result. Takes the higher two 32-bit values from b, swaps them
+// and places them in the high end of the result.
 FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
 {
-	float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-	float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
-	return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
 {
-	float32x2_t a21 = vget_high_f32(vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-	float32x2_t b03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-	return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
+    float32x2_t a21 = vget_high_f32(
+            vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b03 = vget_low_f32(
+            vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
 {
-	float32x2_t a03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
-	float32x2_t b21 = vget_high_f32(vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
-	return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
+    float32x2_t a03 = vget_low_f32(
+            vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
+    float32x2_t b21 = vget_high_f32(
+            vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
+    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
 {
-	float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
 {
-	float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-	float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
 {
-	float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-	float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
-	return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
 }
 
-// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the high
+// keeps the low 64 bits of a in the low and puts the high 64 bits of b in the
+// high
 FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
 {
-	float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
 {
-	float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
-	float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-	return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
+    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
 {
-	float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-	float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-	return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
+    float32x2_t a22 =
+            vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
 {
-	float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
-	float32x2_t b22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
-	return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
+    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t b22 =
+            vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
 {
-	float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-	float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
-	float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* apoty: TODO: use vzip ?*/
-	float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32x2_t a22 =
+            vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
+    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
 {
-	float32x2_t a33 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
-	float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
-	return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
+    float32x2_t a33 =
+            vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
+    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
 {
-	float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-	float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
-	float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-	float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-	return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
 {
-	float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-	float32_t b2 = vgetq_lane_f32(b, 2);
-	float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-	float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-	return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
+    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);  // b20 = {b0, b2}
+    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
 }
 
 FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
 {
-	float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-	float32_t b2 = vgetq_lane_f32(b, 2);
-	float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
-	float32x2_t b20 = vset_lane_f32(b2, b00, 1);
-	return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
+    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
+    float32x2_t b20 = vset_lane_f32(b2, b00, 1);  // b20 = {b0, b2}
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
 }
 
 // NEON does not support a general purpose permute intrinsic
-// Currently I am not sure whether the C implementation is faster or slower than the NEON version.
-// Note, this has to be expanded as a template because the shuffle value must be an immediate value.
-// The same is true on SSE as well.
-// Selects four specific single-precision, floating-point values from a and b, based on the mask i. https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-#if ENABLE_CPP_VERSION // I am not convinced that the NEON version is faster than the C version yet.
-FORCE_INLINE __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, __constrange(0,255) int imm)
-{
-	__m128 ret;
-	ret[0] = a[imm & 0x3];
-	ret[1] = a[(imm >> 2) & 0x3];
-	ret[2] = b[(imm >> 4) & 0x03];
-	ret[3] = b[(imm >> 6) & 0x03];
-	return ret;
+// Selects four specific single-precision, floating-point values from a and b,
+// based on the 8-bit immediate mask imm (one 2-bit lane index per output).
+// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
+#if 0 /* C version */
+FORCE_INLINE __m128 _mm_shuffle_ps_default(__m128 a,
+                                           __m128 b,
+                                           __constrange(0, 255) int imm)
+{
+    __m128 ret;
+    ret[0] = a[imm & 0x3];
+    ret[1] = a[(imm >> 2) & 0x3];
+    ret[2] = b[(imm >> 4) & 0x03];
+    ret[3] = b[(imm >> 6) & 0x03];
+    return ret;
 }
-#else
-#define _mm_shuffle_ps_default(a, b, imm) \
-EXTENSION ({ \
-	float32x4_t ret; \
-	ret = vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & 0x3)); \
-	ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), ret, 1); \
-	ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), ret, 2); \
-	ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), ret, 3); \
-	vreinterpretq_m128_f32(ret); \
-})
 #endif
-
-//FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) int imm)
-#define _mm_shuffle_ps(a, b, imm) \
-EXTENSION ({ \
-	__m128 ret; \
-	switch (imm) \
-	{ \
-		case _MM_SHUFFLE(1, 0, 3, 2): ret = _mm_shuffle_ps_1032((a), (b)); break; \
-		case _MM_SHUFFLE(2, 3, 0, 1): ret = _mm_shuffle_ps_2301((a), (b)); break; \
-		case _MM_SHUFFLE(0, 3, 2, 1): ret = _mm_shuffle_ps_0321((a), (b)); break; \
-		case _MM_SHUFFLE(2, 1, 0, 3): ret = _mm_shuffle_ps_2103((a), (b)); break; \
-		case _MM_SHUFFLE(1, 0, 1, 0): ret = _mm_shuffle_ps_1010((a), (b)); break; \
-		case _MM_SHUFFLE(1, 0, 0, 1): ret = _mm_shuffle_ps_1001((a), (b)); break; \
-		case _MM_SHUFFLE(0, 1, 0, 1): ret = _mm_shuffle_ps_0101((a), (b)); break; \
-		case _MM_SHUFFLE(3, 2, 1, 0): ret = _mm_shuffle_ps_3210((a), (b)); break; \
-		case _MM_SHUFFLE(0, 0, 1, 1): ret = _mm_shuffle_ps_0011((a), (b)); break; \
-		case _MM_SHUFFLE(0, 0, 2, 2): ret = _mm_shuffle_ps_0022((a), (b)); break; \
-		case _MM_SHUFFLE(2, 2, 0, 0): ret = _mm_shuffle_ps_2200((a), (b)); break; \
-		case _MM_SHUFFLE(3, 2, 0, 2): ret = _mm_shuffle_ps_3202((a), (b)); break; \
-		case _MM_SHUFFLE(1, 1, 3, 3): ret = _mm_shuffle_ps_1133((a), (b)); break; \
-		case _MM_SHUFFLE(2, 0, 1, 0): ret = _mm_shuffle_ps_2010((a), (b)); break; \
-		case _MM_SHUFFLE(2, 0, 0, 1): ret = _mm_shuffle_ps_2001((a), (b)); break; \
-		case _MM_SHUFFLE(2, 0, 3, 2): ret = _mm_shuffle_ps_2032((a), (b)); break; \
-		default: ret = _mm_shuffle_ps_default((a), (b), (imm)); break; \
-	} \
-	ret; \
-})
+#define _mm_shuffle_ps_default(a, b, imm)                                  \
+    ({                                                                     \
+        float32x4_t ret; /* one lane per 2-bit field of imm */             \
+        ret = vmovq_n_f32(                                                 \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) &0x3));        \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                       \
+        ret = vsetq_lane_f32(                                              \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                       \
+        vreinterpretq_m128_f32(ret);                                       \
+    })
+
+// Prototype-style summary: __m128 _mm_shuffle_ps(__m128 a, __m128 b, int imm)
+// imm is expected to be an immediate in 0..255; lanes 0-1 come from a, 2-3 from b.
+#if defined(__clang__)
+#define _mm_shuffle_ps(a, b, imm)                          \
+    ({                                                     \
+         float32x4_t _input1 = vreinterpretq_f32_m128(a);  \
+         float32x4_t _input2 = vreinterpretq_f32_m128(b);  \
+         float32x4_t _shuf =                               \
+              __builtin_shufflevector(_input1, _input2,    \
+                (imm) & 0x3,                               \
+                ((imm) >> 2) & 0x3,                        \
+                (((imm) >> 4) & 0x3) + 4,                  \
+                (((imm) >> 6) & 0x3) + 4);                 \
+         vreinterpretq_m128_f32(_shuf);                    \
+    })
+#else // generic
+#define _mm_shuffle_ps(a, b, imm)                          \
+    ({                                                     \
+        __m128 ret;                                        \
+        switch (imm) {                                     \
+        case _MM_SHUFFLE(1, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_1032((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 3, 0, 1):                      \
+            ret = _mm_shuffle_ps_2301((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 3, 2, 1):                      \
+            ret = _mm_shuffle_ps_0321((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 1, 0, 3):                      \
+            ret = _mm_shuffle_ps_2103((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 1, 0):                      \
+            ret = _mm_movelh_ps((a), (b));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_1001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 1, 0, 1):                      \
+            ret = _mm_shuffle_ps_0101((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 1, 0):                      \
+            ret = _mm_shuffle_ps_3210((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 1, 1):                      \
+            ret = _mm_shuffle_ps_0011((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 2, 2):                      \
+            ret = _mm_shuffle_ps_0022((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 2, 0, 0):                      \
+            ret = _mm_shuffle_ps_2200((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 0, 2):                      \
+            ret = _mm_shuffle_ps_3202((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 3, 2):                      \
+            ret = _mm_movehl_ps((b), (a));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 1, 3, 3):                      \
+            ret = _mm_shuffle_ps_1133((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 1, 0):                      \
+            ret = _mm_shuffle_ps_2010((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_2001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_2032((a), (b));           \
+            break;                                         \
+        default:                                           \
+            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+            break;                                         \
+        }                                                  \
+        ret;                                               \
+    })
+#endif // not clang
 
 // Takes the upper 64 bits of a and places it in the low end of the result
 // Takes the lower 64 bits of a and places it into the high end of the result.
 FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
 {
-	int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-	int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-	return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));  // (a2, a3)
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));   // (a0, a1)
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));  // (a2, a3, a0, a1)
 }
 
-// takes the lower two 32-bit values from a and swaps them and places in low end of result
-// takes the higher two 32 bit values from a and swaps them and places in high end of result.
+// Takes the lower two 32-bit values from a, swaps them, and places them in the
+// low end of the result; takes the higher two 32-bit values from a, swaps
+// them, and places them in the high end of the result.
 FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
 {
-	int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-	int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
-	return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));  // (a1, a0, a3, a2)
 }
 
-// rotates the least significant 32 bits into the most signficant 32 bits, and shifts the rest down
+// Rotates the least significant 32 bits into the most significant 32 bits
+// and shifts the rest down.
 FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
 {
-	return vreinterpretq_m128i_s32(vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+    return vreinterpretq_m128i_s32(  // result lanes: (a1, a2, a3, a0)
+            vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
 }
 
-// rotates the most significant 32 bits into the least signficant 32 bits, and shifts the rest up
+// Rotates the most significant 32 bits into the least significant 32 bits
+// and shifts the rest up.
 FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
 {
-	return vreinterpretq_m128i_s32(vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+    return vreinterpretq_m128i_s32(  // result lanes: (a3, a0, a1, a2)
+            vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
 }
 
 // gets the lower 64 bits of a, and places it in the upper 64 bits
 // gets the lower 64 bits of a and places it in the lower 64 bits
 FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
 {
-	int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-	return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));  // (a0, a1)
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));  // (a0, a1, a0, a1)
 }
 
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the lower 64 bits
-// gets the lower 64 bits of a, and places it in the upper 64 bits
+// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
+// lower 64 bits; the unswapped lower 64 bits of a go in the upper 64 bits.
 FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
 {
-	int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-	int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
-	return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));  // (a0, a1)
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));  // (a1, a0, a0, a1)
 }
 
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the upper 64 bits
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the lower 64 bits
+// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in
+// the upper 64 bits; the same swapped pair is also placed in the lower 64
+// bits.
 FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
 {
-	int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-	return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));  // (a1, a0, a1, a0)
 }
 
 FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
 {
-	int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
-	int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-	return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));  // (a1, a1, a2, a2)
 }
 
 FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
 {
-	int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
-	int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
-	return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
+    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));  // (a2, a2, a1, a0)
 }
 
 FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
 {
-	int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
-	int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
-	return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));  // (a2, a3)
+    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));  // (a2, a3, a3, a3)
 }
 
-//FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __constrange(0,255) int imm)
-#if ENABLE_CPP_VERSION
-FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a, __constrange(0,255) int imm)
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8&expand=5146
+FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
 {
-	__m128i ret;
-	ret[0] = a[imm & 0x3];
-	ret[1] = a[(imm >> 2) & 0x3];
-	ret[2] = a[(imm >> 4) & 0x03];
-	ret[3] = a[(imm >> 6) & 0x03];
-	return ret;
-}
+    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a: the lookup table
+    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b: per-byte selectors
+    uint8x16_t idx_masked =
+            vandq_u8(idx, vdupq_n_u8(0x8F));  // keep index bits 0-3 and bit 7
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
 #else
-#define _mm_shuffle_epi32_default(a, imm) \
-EXTENSION ({ \
-	int32x4_t ret; \
-	ret = vmovq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & 0x3)); \
-	ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), ret, 1); \
-	ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), ret, 2); \
-	ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), ret, 3); \
-	vreinterpretq_m128i_s32(ret); \
-})
+    // ARMv7 has no 128-bit table lookup: split the table into two 64-bit
+    // halves (built explicitly rather than by pointer-casting the vector,
+    // which is type punning) and look up each half of the index separately.
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    // vtbl2_s8 takes a signed index vector; reinterpret instead of relying on
+    // lax vector conversions. Indices >= 16 (bit 7 set) yield 0.
+    int8x8_t idx_lo = vreinterpret_s8_u8(vget_low_u8(idx_masked));
+    int8x8_t idx_hi = vreinterpret_s8_u8(vget_high_u8(idx_masked));
+    return vreinterpretq_m128i_s8(
+            vcombine_s8(vtbl2_s8(a_split, idx_lo), vtbl2_s8(a_split, idx_hi)));
 #endif
+}
+
 
-//FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) int imm)
+#if 0 /* C version */
+FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a,
+                                               __constrange(0, 255) int imm)
+{
+    __m128i ret;
+    ret[0] = a[imm & 0x3];
+    ret[1] = a[(imm >> 2) & 0x3];
+    ret[2] = a[(imm >> 4) & 0x03];
+    ret[3] = a[(imm >> 6) & 0x03];
+    return ret;
+}
+#endif
+#define _mm_shuffle_epi32_default(a, imm)                                   \
+    ({                                                                      \
+        int32x4_t ret; /* one lane per 2-bit field of imm */                \
+        ret = vmovq_n_s32(                                                  \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) &0x3));        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
+            ret, 1);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
+            ret, 2);                                                        \
+        ret = vsetq_lane_s32(                                               \
+            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
+            ret, 3);                                                        \
+        vreinterpretq_m128i_s32(ret);                                       \
+    })
+
+// Prototype-style summary: __m128i _mm_shuffle_epi32_splat(__m128i a, int imm)
+// Broadcasts 32-bit lane imm of a to all four lanes of the result.
 #if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm) \
-EXTENSION ({ \
-	vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
-})
+#define _mm_shuffle_epi32_splat(a, imm)                          \
+    ({                                                           \
+        vreinterpretq_m128i_s32(                                 \
+            vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
+    })
 #else
-#define _mm_shuffle_epi32_splat(a, imm) \
-EXTENSION ({ \
-	vreinterpretq_m128i_s32(vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
-})
+#define _mm_shuffle_epi32_splat(a, imm)                                      \
+    ({                                                                       \
+        vreinterpretq_m128i_s32(                                             \
+            vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
+    })
 #endif
 
-// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.	https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-//FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_shuffle_epi32(a, imm) \
-EXTENSION ({ \
-	__m128i ret; \
-	switch (imm) \
-	{ \
-		case _MM_SHUFFLE(1, 0, 3, 2): ret = _mm_shuffle_epi_1032((a)); break; \
-		case _MM_SHUFFLE(2, 3, 0, 1): ret = _mm_shuffle_epi_2301((a)); break; \
-		case _MM_SHUFFLE(0, 3, 2, 1): ret = _mm_shuffle_epi_0321((a)); break; \
-		case _MM_SHUFFLE(2, 1, 0, 3): ret = _mm_shuffle_epi_2103((a)); break; \
-		case _MM_SHUFFLE(1, 0, 1, 0): ret = _mm_shuffle_epi_1010((a)); break; \
-		case _MM_SHUFFLE(1, 0, 0, 1): ret = _mm_shuffle_epi_1001((a)); break; \
-		case _MM_SHUFFLE(0, 1, 0, 1): ret = _mm_shuffle_epi_0101((a)); break; \
-		case _MM_SHUFFLE(2, 2, 1, 1): ret = _mm_shuffle_epi_2211((a)); break; \
-		case _MM_SHUFFLE(0, 1, 2, 2): ret = _mm_shuffle_epi_0122((a)); break; \
-		case _MM_SHUFFLE(3, 3, 3, 2): ret = _mm_shuffle_epi_3332((a)); break; \
-		case _MM_SHUFFLE(0, 0, 0, 0): ret = _mm_shuffle_epi32_splat((a),0); break; \
-		case _MM_SHUFFLE(1, 1, 1, 1): ret = _mm_shuffle_epi32_splat((a),1); break; \
-		case _MM_SHUFFLE(2, 2, 2, 2): ret = _mm_shuffle_epi32_splat((a),2); break; \
-		case _MM_SHUFFLE(3, 3, 3, 3): ret = _mm_shuffle_epi32_splat((a),3); break; \
-		default: ret = _mm_shuffle_epi32_default((a), (imm)); break; \
-	} \
-	ret; \
-})
-
-// Shuffles the upper 4 signed or unsigned 16 - bit integers in a as specified by imm.  https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, __constrange(0,255) int imm)
-#define _mm_shufflehi_epi16_function(a, imm) \
-EXTENSION ({ \
-	int16x8_t ret = vreinterpretq_s16_s32(a); \
-	int16x4_t highBits = vget_high_s16(ret); \
-	ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & 0x3), ret, 4); \
-	ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, 5); \
-	ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, 6); \
-	ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, 7); \
-	vreinterpretq_s32_s16(ret); \
-})
-
-//FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, __constrange(0,255) int imm)
-#define _mm_shufflehi_epi16(a, imm) \
-	_mm_shufflehi_epi16_function((a), (imm))
-
-
-//added by hasindu
-//Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while shifting in zeros.	https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
-#define _mm_slli_epi16(a, imm) \
-EXTENSION ({ \
-	__m128i ret; \
-	if ((imm) <= 0) {\
-		ret = a; \
-	} \
-	else if ((imm) > 31) { \
-		ret = _mm_setzero_si128(); \
-	} \
-	else { \
-		ret = vreinterpretq_m128i_s16(vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
-	} \
-	ret; \
-})
-
-
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while shifting in zeros. : https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
-//FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_epi32(a, imm) \
-EXTENSION ({ \
-	__m128i ret; \
-	if ((imm) <= 0) {\
-		ret = a; \
-	} \
-	else if ((imm) > 31) { \
-		ret = _mm_setzero_si128(); \
-	} \
-	else { \
-		ret = vreinterpretq_m128i_s32(vshlq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
-	} \
-	ret; \
-})
-
-
-//added by hasindu
-// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits while shifting in zeros.
-//https://msdn.microsoft.com/en-us/library/6tcwd38t(v=vs.90).aspx
-#define _mm_srli_epi16(a, imm) \
-EXTENSION ({ \
-	__m128i ret; \
-	if ((imm) <= 0) { \
-		ret = a; \
-	} \
-	else if ((imm)> 31) { \
-		ret = _mm_setzero_si128(); \
-	} \
-	else { \
-		ret = vreinterpretq_m128i_u16(vshrq_n_u16(vreinterpretq_u16_m128i(a), (imm))); \
-	} \
-	ret; \
-})
-
-
-//Shifts the 4 signed or unsigned 32-bit integers in a right by count bits while shifting in zeros.  https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm) \
-EXTENSION ({ \
-	__m128i ret; \
-	if ((imm) <= 0) { \
-		ret = a; \
-	} \
-	else if ((imm)> 31) { \
-		ret = _mm_setzero_si128(); \
-	} \
-	else { \
-		ret = vreinterpretq_m128i_u32(vshrq_n_u32(vreinterpretq_u32_m128i(a), (imm))); \
-	} \
-	ret; \
-})
-
-// Shifts the 4 signed 32 - bit integers in a right by count bits while shifting in the sign bit.  https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm) \
-EXTENSION ({ \
-	__m128i ret; \
-	if ((imm) <= 0) { \
-		ret = a; \
-	} \
-	else if ((imm) > 31) { \
-		ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(a), 16)); \
-		ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(ret), 16)); \
-	} \
-	else { \
-		ret = vreinterpretq_m128i_s32(vshrq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
-	} \
-	ret; \
-})
-
-// Shifts the 128 - bit value in a right by imm bytes while shifting in zeros.imm must be an immediate. https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
-//FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_si128(a, imm) \
-EXTENSION ({ \
-	__m128i ret; \
-	if ((imm) <= 0) { \
-		ret = a; \
-	} \
-	else if ((imm) > 15) { \
-		ret = _mm_setzero_si128(); \
-	} \
-	else { \
-		ret = vreinterpretq_m128i_s8(vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
-	} \
-	ret; \
-})
-
-// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm must be an immediate.  https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
-//FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_si128(a, imm) \
-EXTENSION ({ \
-	__m128i ret; \
-	if ((imm) <= 0) { \
-		ret = a; \
-	} \
-	else if ((imm) > 15) { \
-		ret = _mm_setzero_si128(); \
-	} \
-	else { \
-		ret = vreinterpretq_m128i_s8(vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
-	} \
-	ret; \
-})
-
-// NEON does not provide a version of this function, here is an article about some ways to repro the results.
-// http://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
-// Creates a 16-bit mask from the most significant bits of the 16 signed or unsigned 8-bit integers in a and zero extends the upper bits. https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_epi8(__m128i _a)
-{
-	uint8x16_t input = vreinterpretq_u8_m128i(_a);
-	static const int8_t __attribute__((aligned(16))) xr[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };
-	uint8x8_t mask_and = vdup_n_u8(0x80);
-	int8x8_t mask_shift = vld1_s8(xr);
-
-	uint8x8_t lo = vget_low_u8(input);
-	uint8x8_t hi = vget_high_u8(input);
-
-	lo = vand_u8(lo, mask_and);
-	lo = vshl_u8(lo, mask_shift);
-
-	hi = vand_u8(hi, mask_and);
-	hi = vshl_u8(hi, mask_shift);
-
-	lo = vpadd_u8(lo, lo);
-	lo = vpadd_u8(lo, lo);
-	lo = vpadd_u8(lo, lo);
-
-	hi = vpadd_u8(hi, hi);
-	hi = vpadd_u8(hi, hi);
-	hi = vpadd_u8(hi, hi);
-
-	return ((hi[0] << 8) | (lo[0] & 0xFF));
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+// Prototype-style summary: __m128i _mm_shuffle_epi32(__m128i a, int imm),
+// where each 2-bit field of imm (0..255) picks the source lane of one output.
+#if defined(__clang__)
+#define _mm_shuffle_epi32(a, imm)                        \
+    ({                                                   \
+         int32x4_t _input = vreinterpretq_s32_m128i(a);  \
+         int32x4_t _shuf =                               \
+              __builtin_shufflevector(_input, _input,    \
+                (imm) & 0x3,        ((imm) >> 2) & 0x3,  \
+                ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
+         vreinterpretq_m128i_s32(_shuf);                 \
+    })
+#else // generic
+#define _mm_shuffle_epi32(a, imm)                        \
+    ({                                                   \
+        __m128i ret;                                     \
+        switch (imm) {                                   \
+        case _MM_SHUFFLE(1, 0, 3, 2):                    \
+            ret = _mm_shuffle_epi_1032((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 3, 0, 1):                    \
+            ret = _mm_shuffle_epi_2301((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 3, 2, 1):                    \
+            ret = _mm_shuffle_epi_0321((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 1, 0, 3):                    \
+            ret = _mm_shuffle_epi_2103((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 1, 0):                    \
+            ret = _mm_shuffle_epi_1010((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(1, 0, 0, 1):                    \
+            ret = _mm_shuffle_epi_1001((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 0, 1):                    \
+            ret = _mm_shuffle_epi_0101((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 1, 1):                    \
+            ret = _mm_shuffle_epi_2211((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 1, 2, 2):                    \
+            ret = _mm_shuffle_epi_0122((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 2):                    \
+            ret = _mm_shuffle_epi_3332((a));             \
+            break;                                       \
+        case _MM_SHUFFLE(0, 0, 0, 0):                    \
+            ret = _mm_shuffle_epi32_splat((a), 0);       \
+            break;                                       \
+        case _MM_SHUFFLE(1, 1, 1, 1):                    \
+            ret = _mm_shuffle_epi32_splat((a), 1);       \
+            break;                                       \
+        case _MM_SHUFFLE(2, 2, 2, 2):                    \
+            ret = _mm_shuffle_epi32_splat((a), 2);       \
+            break;                                       \
+        case _MM_SHUFFLE(3, 3, 3, 3):                    \
+            ret = _mm_shuffle_epi32_splat((a), 3);       \
+            break;                                       \
+        default:                                         \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break;                                       \
+        }                                                \
+        ret;                                             \
+    })
+#endif // not clang
+
+// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
+// Generic fallback: rewrites lanes 0-3 one at a time from a copy of the low
+// half; lanes 4-7 of a pass through unchanged. imm: immediate in 0..255.
+
+#define _mm_shufflelo_epi16_function(a, imm)                                  \
+    ({                                                                        \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                           \
+        int16x4_t lowBits = vget_low_s16(ret);                                \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) &0x3), ret, 0);     \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2);                                              \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3);                                              \
+        vreinterpretq_m128i_s16(ret);                                         \
+    })
+
+// Prototype-style summary: __m128i _mm_shufflelo_epi16(__m128i a, int imm)
+// clang: one __builtin_shufflevector; otherwise the lane-by-lane macro above.
+#if defined(__clang__)
+#define _mm_shufflelo_epi16(a, imm)                      \
+    ({                                                   \
+         int16x8_t _input = vreinterpretq_s16_m128i(a);  \
+         int16x8_t _shuf =                               \
+              __builtin_shufflevector(_input, _input,    \
+                ((imm) & 0x3),                           \
+                (((imm) >> 2) & 0x3),                    \
+                (((imm) >> 4) & 0x3),                    \
+                (((imm) >> 6) & 0x3),                    \
+                4, 5, 6, 7);                             \
+         vreinterpretq_m128i_s16(_shuf);                 \
+    })
+#else // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
+// __constrange(0,255) int imm)
+#define _mm_shufflehi_epi16_function(a, imm)                                   \
+    ({                                                                         \
+        int16x8_t ret = vreinterpretq_s16_m128i(a);                            \
+        int16x4_t highBits = vget_high_s16(ret);                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) &0x3), ret, 4);     \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6);                                               \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7);                                               \
+        vreinterpretq_m128i_s16(ret);                                          \
+    })
+
+// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, __constrange(0,255) int
+// imm)
+#if defined(__clang__)
+#define _mm_shufflehi_epi16(a, imm)                      \
+    ({                                                   \
+         int16x8_t _input = vreinterpretq_s16_m128i(a);  \
+         int16x8_t _shuf =                               \
+              __builtin_shufflevector(_input, _input,    \
+                0, 1, 2, 3,                              \
+                ((imm) & 0x3) + 4,                       \
+                (((imm) >> 2) & 0x3) + 4,                \
+                (((imm) >> 4) & 0x3) + 4,                \
+                (((imm) >> 6) & 0x3) + 4);               \
+         vreinterpretq_m128i_s16(_shuf);                 \
+    })
+#else // generic
+#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
+#endif
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+//
+//   FOR j := 0 to 7
+//       i := j*16
+//       IF imm8[j]
+//           dst[i+15:i] := b[i+15:i]
+//       ELSE
+//           dst[i+15:i] := a[i+15:i]
+//       FI
+//   ENDFOR
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, __constrange(0,255)
+// int imm)
+#define _mm_blend_epi16(a, b, imm)                       \
+    ({                                                   \
+        const uint16_t _mask[8] = {                      \
+            ((imm) & (1 << 0)) ? 0xFFFF : 0x0000,        \
+            ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,        \
+            ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,        \
+            ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,        \
+            ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,        \
+            ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,        \
+            ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,        \
+            ((imm) & (1 << 7)) ? 0xFFFF : 0x0000         \
+        };                                               \
+        uint16x8_t _mask_vec = vld1q_u16(_mask);         \
+        uint16x8_t _a = vreinterpretq_u16_m128i(a);      \
+        uint16x8_t _b = vreinterpretq_u16_m128i(b);      \
+        vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
+    })
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in dst.
+//
+//   FOR j := 0 to 15
+//       i := j*8
+//       IF mask[i+7]
+//           dst[i+7:i] := b[i+7:i]
+//       ELSE
+//           dst[i+7:i] := a[i+7:i]
+//       FI
+//   ENDFOR
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint8x16_t mask = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    uint8x16_t b = vreinterpretq_u8_m128i(_b);
+    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+/////////////////////////////////////
+// Shifts
+/////////////////////////////////////
+
+// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+//   r0 := a0 >> count
+//   r1 := a1 >> count
+//   r2 := a2 >> count
+//   r3 := a3 >> count immediate
+FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, int count)
+{
+    return (__m128i) vshlq_s32((int32x4_t) a, vdupq_n_s32(-count));
+}
+
+// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
+// in the sign bit. Counts outside 0..15 are clamped to 15, so every lane
+// collapses to its sign (0 or -1), which is what SSE2 does for count > 15.
+//
+//   r0 := a0 >> count ... r7 := a7 >> count
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int count)
+{
+    // NEON reads only the low byte of each shift lane, so clamp before negating.
+    const int shift = (count & ~15) ? 15 : count;
+    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-shift));
 }
 
+// Shifts the 8 signed or unsigned 16-bit integers in a left by imm bits while
+// shifting in zeros. imm <= 0 returns a unchanged; imm > 15 returns all zeros.
+//
+//   r0 := a0 << imm
+//   r1 := a1 << imm
+//   ...
+//   r7 := a7 << imm
+//
+// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
+#define _mm_slli_epi16(a, imm)                                   \
+    ({                                                           \
+        __m128i ret;                                             \
+        if ((imm) <= 0) {                                        \
+            ret = a;                                             \
+        } else if ((imm) > 15) {                                 \
+            ret = _mm_setzero_si128();                           \
+        } else {                                                 \
+            ret = vreinterpretq_m128i_s16(                       \
+                vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
+        }                                                        \
+        ret;                                                     \
+    })
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
+// shifting in zeros.
+// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_slli_epi32(a, imm)                                   \
+    ({                                                           \
+        __m128i ret;                                             \
+        if ((imm) <= 0) {                                        \
+            ret = a;                                             \
+        } else if ((imm) > 31) {                                 \
+            ret = _mm_setzero_si128();                           \
+        } else {                                                 \
+            ret = vreinterpretq_m128i_s32(                       \
+                vshlq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
+        }                                                        \
+        ret;                                                     \
+    })
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+#define _mm_slli_epi64(a, imm)                                   \
+    ({                                                           \
+        __m128i ret;                                             \
+        if ((imm) <= 0) {                                        \
+            ret = a;                                             \
+        } else if ((imm) > 63) {                                 \
+            ret = _mm_setzero_si128();                           \
+        } else {                                                 \
+            ret = vreinterpretq_m128i_s64(                       \
+                vshlq_n_s64(vreinterpretq_s64_m128i(a), (imm))); \
+        }                                                        \
+        ret;                                                     \
+    })
+
+// Shifts the 8 signed or unsigned 16-bit integers in a right by imm bits
+// while shifting in zeros. imm <= 0 returns a unchanged; imm > 15 returns 0.
+//
+//   r0 := srl(a0, imm)
+//   r1 := srl(a1, imm)
+//   ...
+//   r7 := srl(a7, imm)
+//
+// https://msdn.microsoft.com/en-us/library/6tcwd38t(v=vs.90).aspx
+#define _mm_srli_epi16(a, imm)                                   \
+    ({                                                           \
+        __m128i ret;                                             \
+        if ((imm) <= 0) {                                        \
+            ret = a;                                             \
+        } else if ((imm) > 15) {                                 \
+            ret = _mm_setzero_si128();                           \
+        } else {                                                 \
+            ret = vreinterpretq_m128i_u16(                       \
+                vshrq_n_u16(vreinterpretq_u16_m128i(a), (imm))); \
+        }                                                        \
+        ret;                                                     \
+    })
+
+// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
+// while shifting in zeros.
+// https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm)                                   \
+    ({                                                           \
+        __m128i ret;                                             \
+        if ((imm) <= 0) {                                        \
+            ret = a;                                             \
+        } else if ((imm) > 31) {                                 \
+            ret = _mm_setzero_si128();                           \
+        } else {                                                 \
+            ret = vreinterpretq_m128i_u32(                       \
+                vshrq_n_u32(vreinterpretq_u32_m128i(a), (imm))); \
+        }                                                        \
+        ret;                                                     \
+    })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
+// store the results in dst.
+#define _mm_srli_epi64(a, imm)                                   \
+    ({                                                           \
+        __m128i ret;                                             \
+        if ((imm) <= 0) {                                        \
+            ret = a;                                             \
+        } else if ((imm) > 63) {                                 \
+            ret = _mm_setzero_si128();                           \
+        } else {                                                 \
+            ret = vreinterpretq_m128i_u64(                       \
+                vshrq_n_u64(vreinterpretq_u64_m128i(a), (imm))); \
+        }                                                        \
+        ret;                                                     \
+    })
+
+// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
+// in the sign bit.
+// https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm)                                   \
+    ({                                                           \
+        __m128i ret;                                             \
+        if ((imm) <= 0) {                                        \
+            ret = a;                                             \
+        } else if ((imm) > 31) {                                 \
+            ret = vreinterpretq_m128i_s32(                       \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 16));    \
+            ret = vreinterpretq_m128i_s32(                       \
+                vshrq_n_s32(vreinterpretq_s32_m128i(ret), 16));  \
+        } else {                                                 \
+            ret = vreinterpretq_m128i_s32(                       \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), (imm))); \
+        }                                                        \
+        ret;                                                     \
+    })
+
+// Shifts the 128-bit value in a right by imm bytes while shifting in
+// zeros. imm must be an immediate.
+//
+//   r := srl(a, imm*8)
+//
+// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
+// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_si128(a, imm)                                              \
+    ({                                                                      \
+        __m128i ret;                                                        \
+        if ((imm) <= 0) {                                                   \
+            ret = a;                                                        \
+        } else if ((imm) > 15) {                                            \
+            ret = _mm_setzero_si128();                                      \
+        } else {                                                            \
+            ret = vreinterpretq_m128i_s8(                                   \
+                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
+        }                                                                   \
+        ret;                                                                \
+    })
+
+// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
+// must be an immediate.
+//
+//   r := a << (imm * 8)
+//
+// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_slli_si128(a, imm)                                          \
+    ({                                                                  \
+        __m128i ret;                                                    \
+        if ((imm) <= 0) {                                               \
+            ret = a;                                                    \
+        } else if ((imm) > 15) {                                        \
+            ret = _mm_setzero_si128();                                  \
+        } else {                                                        \
+            ret = vreinterpretq_m128i_s8(vextq_s8(                      \
+                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
+        }                                                               \
+        ret;                                                            \
+    })
+
+
+// NEON does not provide a version of this function.
+// Creates a 16-bit mask from the most significant bits of the 16 signed or
+// unsigned 8-bit integers in a and zero extends the upper bits.
+// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_epi8(__m128i a)
+{
+    // Use increasingly wide shifts+adds to collect the sign bits
+    // together.
+    // Since the widening shifts would be rather confusing to follow in little endian, everything
+    // will be illustrated in big endian order instead. This has a different result - the bits
+    // would actually be reversed on a big endian machine.
+
+    // Starting input (only half the elements are shown):
+    // 89 ff 1d c0 00 10 99 33
+    uint8x16_t input = vreinterpretq_u8_m128i(a);
+
+    // Shift out everything but the sign bits with an unsigned shift right.
+    //
+    // Bytes of the vector::
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
+    // xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    return vgetq_lane_u8(paired64, 0) | ((int)vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// NEON does not provide this method
+// Creates a 4-bit mask from the most significant bits of the four
+// single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    // Uses the exact same method as _mm_movemask_epi8, see that for details
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+    // Shift out everything but the sign bits with a 32-bit unsigned shift right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+}
+
+// Compute the bitwise AND of 128 bits (representing integer data) in a and
+// mask, and return 1 if the result is zero, otherwise return 0.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros&expand=5871
+FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
+{
+    int64x2_t a_and_mask =
+            vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
+    return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
+                                                                           : 1;
+}
 
 // ******************************************
 // Math operations
 // ******************************************
 
-// Subtracts the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
+// Subtracts the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+//   r2 := a2 - b2
+//   r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_f32(vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(
+            vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
+// and store the results in dst.
+//    r0 := a0 - b0
+//    r1 := a1 - b1
+FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+            vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
 }
 
-// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or unsigned 32-bit integers of a. https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
+// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
+// unsigned 32-bit integers of a.
+//
+//   r0 := a0 - b0
+//   r1 := a1 - b1
+//   r2 := a2 - b2
+//   r3 := a3 - b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128_f32(vsubq_s32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128i_s32(
+            vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s16(vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    return vreinterpretq_m128i_s16(
+            vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-//added by hasindu
 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s8(vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+    return vreinterpretq_m128i_s8(
+            vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }
 
-//added by hasindu
-//Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit integers of a and saturates.. https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
+// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
+// integers of a and saturates.
+// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
 FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u16(vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+    return vreinterpretq_m128i_u16(
+            vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
 }
 
-//added by hasindu
-//Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit integers of a and saturates.. https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
+// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
+// integers of a and saturates.
+//
+//   r0 := UnsignedSaturate(a0 - b0)
+//   r1 := UnsignedSaturate(a1 - b1)
+//   ...
+//   r15 := UnsignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
 FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u8(vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+    return vreinterpretq_m128i_u8(
+            vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
+// of a and saturates.
+//
+//   r0 := SignedSaturate(a0 - b0)
+//   r1 := SignedSaturate(a1 - b1)
+//   ...
+//   r7 := SignedSaturate(a7 - b7)
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+            vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-// Adds the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+            vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+//   for i in 0..15
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    int8x16_t zero = vdupq_n_s8(0);
+    // arithmetic shift right replicates the sign bit across the lane:
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+    // (b == 0) ? 0xFF : 0
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, zero));
+    // -a
+    int8x16_t neg = vnegq_s8(a);
+    // vbsl takes its first operand where the mask is set: neg if b < 0, else a
+    int8x16_t masked = vbslq_s8(ltMask, neg, a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+//   for i in 0..7
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    int16x8_t zero = vdupq_n_s16(0);
+    // arithmetic shift right replicates the sign bit across the lane:
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, zero));
+    // -a
+    int16x8_t neg = vnegq_s16(a);
+    // vbsl takes its first operand where the mask is set: neg if b < 0, else a
+    int16x8_t masked = vbslq_s16(ltMask, neg, a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+//   for i in 0..3
+//     if b[i] < 0
+//       r[i] := -a[i]
+//     else if b[i] == 0
+//       r[i] := 0
+//     else
+//       r[i] := a[i]
+//     fi
+//   done
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    int32x4_t zero = vdupq_n_s32(0);
+    // arithmetic shift right replicates the sign bit across the lane:
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+    // (b == 0) ? 0xFFFFFFFF : 0
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, zero));
+    // neg = -a
+    int32x4_t neg = vnegq_s32(a);
+    // vbsl takes its first operand where the mask is set: neg if b < 0, else a
+    int32x4_t masked = vbslq_s32(ltMask, neg, a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Adds the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 + b0
+//   r1 := a1 + b1
+//   r2 := a2 + b2
+//   r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_f32(vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(
+            vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// adds the scalar single-precision floating point values of a and b.  https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+// adds the scalar single-precision floating point values of a and b.
+// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
 {
-	float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
-	float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-	//the upper values in the result must be the remnants of <a>.
-	return vreinterpretq_m128_f32(vaddq_f32(a, value));
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
 }
 
-// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or unsigned 32-bit integers in b. https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
+// unsigned 64-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+            vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+//
+//   r0 := a0 + b0
+//   r1 := a1 + b1
+//   r2 := a2 + b2
+//   r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s32(vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    return vreinterpretq_m128i_s32(
+            vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or unsigned 16-bit integers in b. https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
+// unsigned 16-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s16(vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    return vreinterpretq_m128i_s16(
+            vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-//added by hasindu
-// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or unsigned 8-bit integers in b. https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
+// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
+// unsigned 8-bit integers in b.
+// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s8(vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+    return vreinterpretq_m128i_s8(
+            vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }
 
-//added by hasindu
-// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b and saturates. https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
+// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
+// and saturates.
+//
+//   r0 := SignedSaturate(a0 + b0)
+//   r1 := SignedSaturate(a1 + b1)
+//   ...
+//   r7 := SignedSaturate(a7 + b7)
+//
+// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s16(vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    return vreinterpretq_m128i_s16(
+            vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-//added by hasindu
-//Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in b and saturates.. https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
+// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
+// b and saturates.
+// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u8(vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+    return vreinterpretq_m128i_u8(
+            vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
 }
 
-
-// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or unsigned 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
+// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
+// unsigned 16-bit integers from b.
+//
+//   r0 := (a0 * b0)[15:0]
+//   r1 := (a1 * b1)[15:0]
+//   ...
+//   r7 := (a7 * b7)[15:0]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s16(vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    return vreinterpretq_m128i_s16(
+            vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or unsigned 32-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
+// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
+// unsigned 32-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s32(vmulq_s32(vreinterpretq_s32_m128i(a),vreinterpretq_s32_m128i(b)));
+    return vreinterpretq_m128i_s32(
+            vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Multiplies the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
+// Multiplies the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 * b0
+//   r1 := a1 * b1
+//   r2 := a2 * b2
+//   r3 := a3 * b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(
+            vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+//
+//   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
+//   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
+    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
 }
 
-// Divides the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+//
+//   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
+//   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+    // vmull_s32 upcasts instead of masking, so we downcast.
+    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b and adds adjacent pairs of the 32-bit products.
+//
+//   r0 := (a0 * b0) + (a1 * b1)
+//   r1 := (a2 * b2) + (a3 * b3)
+//   r2 := (a4 * b4) + (a5 * b5)
+//   r3 := (a6 * b6) + (a7 * b7)
+// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate signed
+// 32-bit integers. Shift right by 15 bits while rounding up, and store the
+// packed 16-bit integers in dst.
+//
+//   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
+//   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
+//   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
+//   ...
+//   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+    // Has issues due to saturation
+    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the corresponding
+// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
+// and pack the saturated results in dst.
+//
+//   FOR j := 0 to 7
+//      i := j*16
+//      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
+//   ENDFOR
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+    // This would be much simpler if x86 would choose to zero extend OR sign extend,
+    // not both.
+    // This could probably be optimized better.
+    uint16x8_t a = vreinterpretq_u16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // Zero extend a
+    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+    int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+    // multiply
+    int16x8_t prod1 = vmulq_s16(a_even, b_even);
+    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+}
+
+// Computes the absolute difference of the 16 unsigned 8-bit integers from a
+// and the 16 unsigned 8-bit integers from b.
+//
+// Return Value
+// Sums the upper 8 differences and lower 8 differences and packs the
+// resulting 2 unsigned 16-bit integers into the upper and lower 64-bit
+// elements.
+//
+//   r0 := abs(a0 - b0) + abs(a1 - b1) +...+ abs(a7 - b7)
+//   r1 := 0x0
+//   r2 := 0x0
+//   r3 := 0x0
+//   r4 := abs(a8 - b8) + abs(a9 - b9) +...+ abs(a15 - b15)
+//   r5 := 0x0
+//   r6 := 0x0
+//   r7 := 0x0
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+    uint16_t r4 = t[4] + t[5] + t[6] + t[7];
+    uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
+    return (__m128i) vsetq_lane_u16(r4, r, 4);
+}
+
+// Divides the four single-precision, floating-point values of a and b.
+//
+//   r0 := a0 / b0
+//   r1 := a1 / b1
+//   r2 := a2 / b2
+//   r3 := a3 / b3
+//
+// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
 {
-	float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
-	float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
-	return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
+    float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    float32x4_t recip1 =
+            vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
 }
 
-// Divides the scalar single-precision floating point value of a by b.  https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+// Divides the scalar single-precision floating point value of a by b.
+// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
 {
-	float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
-	return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+    float32_t value =
+            vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+            vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
 }
 
-// This version does additional iterations to improve accuracy.  Between 1 and 4 recommended.
-// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
+// This version does additional iterations to improve accuracy.  Between 1 and 4
+// recommended. Computes the approximations of reciprocals of the four
+// single-precision, floating-point values of a.
+// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
 FORCE_INLINE __m128 recipq_newton(__m128 in, int n)
 {
-	int i;
-	float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-	for (i = 0; i < n; ++i)
-	{
-		recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-	}
-	return vreinterpretq_m128_f32(recip);
+    int i;
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    for (i = 0; i < n; ++i) {
+        recip =
+                vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+    }
+    return vreinterpretq_m128_f32(recip);
 }
 
-// Computes the approximations of reciprocals of the four single-precision, floating-point values of a. https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
+// Computes the approximations of reciprocals of the four single-precision,
+// floating-point values of a.
+// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
 {
-	float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
-	recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
-	return vreinterpretq_m128_f32(recip);
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+    return vreinterpretq_m128_f32(recip);
 }
 
-// Computes the approximations of square roots of the four single-precision, floating-point values of a. First computes reciprocal square roots and then reciprocals of the four values. https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+// Computes the approximations of square roots of the four single-precision,
+// floating-point values of a. First computes reciprocal square roots and then
+// reciprocals of the four values.
+//
+//   r0 := sqrt(a0)
+//   r1 := sqrt(a1)
+//   r2 := sqrt(a2)
+//   r3 := sqrt(a3)
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
 {
-	float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
-	float32x4_t sq = vrecpeq_f32(recipsq);
-	// ??? use step versions of both sqrt and recip for better accuracy?
-	return vreinterpretq_m128_f32(sq);
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    // ??? use step versions of both sqrt and recip for better accuracy?
+    return vreinterpretq_m128_f32(sq);
 }
 
-// Computes the approximation of the square root of the scalar single-precision floating point value of in.  https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+// Computes the approximation of the square root of the scalar single-precision
+// floating point value of in.
+// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
 {
-	float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
-	return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+    float32_t value =
+            vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+            vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
 }
 
-// Computes the approximations of the reciprocal square roots of the four single-precision floating point values of in.  https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+// Computes the approximations of the reciprocal square roots of the four
+// single-precision floating point values of in.
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
 {
-	return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
+    return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
 }
 
-// Computes the maximums of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+// Computes the maximums of the four single-precision, floating-point values of
+// a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_f32(vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(
+            vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Computes the minima of the four single-precision, floating-point values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
+// Computes the minima of the four single-precision, floating-point values of a
+// and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_f32(vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(
+            vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Computes the maximum of the two lower scalar single-precision floating point values of a and b.  https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
+// Computes the maximum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
 {
-	float32_t value = vgetq_lane_f32(vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-	return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+    float32_t value = vgetq_lane_f32(
+            vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(
+            vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
 }
 
-// Computes the minimum of the two lower scalar single-precision floating point values of a and b.  https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
+// Computes the minimum of the two lower scalar single-precision floating point
+// values of a and b.
+// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
 {
-	float32_t value = vgetq_lane_f32(vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-	return vreinterpretq_m128_f32(vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+    float32_t value = vgetq_lane_f32(
+            vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
+    return vreinterpretq_m128_f32(
+            vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
 }
 
-//added by hasindu
-//Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the 16 unsigned 8-bit integers from b. https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
+// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u8(vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+    return vreinterpretq_m128i_u8(
+            vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
 }
 
-//added by hasindu
-//Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the 16 unsigned 8-bit integers from b. https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
+// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u8(vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+    return vreinterpretq_m128i_u8(
+            vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
 }
 
-
-// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
+// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s16(vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    return vreinterpretq_m128i_s16(
+            vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-//added by hasindu
-//Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
+// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s16(vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    return vreinterpretq_m128i_s16(
+            vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-
 // epi versions of min/max
-// Computes the pariwise maximums of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+// Computes the pairwise maximums of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 > b0) ? a0 : b0
+//   r1 := (a1 > b1) ? a1 : b1
+//   r2 := (a2 > b2) ? a2 : b2
+//   r3 := (a3 > b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s32(vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    return vreinterpretq_m128i_s32(
+            vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Computes the pariwise minima of the four signed 32-bit integer values of a and b. https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+// Computes the pairwise minima of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+//   r0 := (a0 < b0) ? a0 : b0
+//   r1 := (a1 < b1) ? a1 : b1
+//   r2 := (a2 < b2) ? a2 : b2
+//   r3 := (a3 < b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s32(vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    return vreinterpretq_m128i_s32(
+            vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b. https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+//   r0 := (a0 * b0)[31:16]
+//   r1 := (a1 * b1)[31:16]
+//   ...
+//   r7 := (a7 * b7)[31:16]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
 {
-	/* apoty: issue with large values because of result saturation */
-	//int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)); /* =2*a*b */
-	//return vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
-	int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
-	int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
-	int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
-	int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
-	int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
-	int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
-	uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
-	return vreinterpretq_m128i_u16(r.val[1]);
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    //     vreinterpretq_s16_m128i(b)); /* =2*a*b */
+    // return vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+            vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
 }
 
-// Computes pairwise add of each argument as single-precision, floating-point values a and b.
-//https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b )
+// High 16 bits of each unsigned 16-bit product a*b. FIXME: is this correct?
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
+    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+            vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Computes pairwise add of each argument as single-precision, floating-point
+// values a and b.
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
 {
 #if defined(__aarch64__)
-	return vreinterpretq_m128_f32(vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); //AArch64
+    return vreinterpretq_m128_f32(vpaddq_f32(
+        vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));  // AArch64
 #else
-	float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
-	float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
-	float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_f32(vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+            vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
 #endif
 }
 
+// Computes pairwise add of each argument as 16-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_m128i_s16(
+            vcombine_s16(
+                    vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                    vpadd_s16(vget_low_s16(b), vget_high_s16(b))
+            )
+    );
+}
+
+// Computes pairwise difference of each argument as 16-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Subtract
+    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
+}
+
+// Computes saturated pairwise add of each argument as 16-bit signed
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+}
+
+// Computes saturated pairwise difference of each argument as 16-bit signed
+// integer values a and b.
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated subtract
+    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
+}
+
+// Computes pairwise add of each argument as 32-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    return vreinterpretq_m128i_s32(
+            vcombine_s32(
+                    vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                    vpadd_s32(vget_low_s32(b), vget_high_s32(b))
+            )
+    );
+}
+
+// Computes pairwise difference of each argument as 32-bit signed or unsigned integer
+// values a and b.
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int64x2_t a = vreinterpretq_s64_m128i(_a);
+    int64x2_t b = vreinterpretq_s64_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|b0|b2]
+    // [a1|a3|b1|b3]
+    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
+    // Subtract
+    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
+}
+
 // ******************************************
 // Compare operations
 // ******************************************
 
-// Compares for less than https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
+// Compares for less than
+// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_u32(vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_u32(
+            vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Compares for greater than. https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
+// Compares for greater than.
+//
+//   r0 := (a0 > b0) ? 0xffffffff : 0x0
+//   r1 := (a1 > b1) ? 0xffffffff : 0x0
+//   r2 := (a2 > b2) ? 0xffffffff : 0x0
+//   r3 := (a3 > b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_u32(vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_u32(
+            vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Compares for greater than or equal. https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
+// Compares for greater than or equal.
+// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_u32(vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_u32(
+            vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Compares for less than or equal. https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
+// Compares for less than or equal.
+//
+//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
+//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
+//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
+//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_u32(vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_u32(
+            vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
-// Compares for equality. https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
+// Compares for equality.
+// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
 {
-	return vreinterpretq_m128_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_u32(
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
 }
 
+// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
+// unsigned 8-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+            vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
 
-//added by hasindu
-//Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or unsigned 8-bit integers in b for equality. https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
+// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
+// unsigned 16-bit integers in b for equality.
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u8(vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+    return vreinterpretq_m128i_u16(
+            vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
-//added by hasindu
-//Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or unsigned 16-bit integers in b for equality.
-//https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u16(vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    return vreinterpretq_m128i_u32(
+            vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-//added by hasindu
-//Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers in b for lesser than. https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u8(vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
 }
 
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for lesser than.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+            vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
 
-//added by hasindu
-//Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers in b for greater than. https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for greater than.
+//
+//   r0 := (a0 > b0) ? 0xff : 0x0
+//   r1 := (a1 > b1) ? 0xff : 0x0
+//   ...
+//   r15 := (a15 > b15) ? 0xff : 0x0
+//
+// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u8(vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+    return vreinterpretq_m128i_u8(
+            vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
 }
 
-//added by hasindu
-//Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers in b for greater than. https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
+// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
+// in b for greater than.
+//
+//   r0 := (a0 > b0) ? 0xffff : 0x0
+//   r1 := (a1 > b1) ? 0xffff : 0x0
+//   ...
+//   r7 := (a7 > b7) ? 0xffff : 0x0
+//
+// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u16(vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+    return vreinterpretq_m128i_u16(
+            vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
 }
 
 
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for less than. https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u32(vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    return vreinterpretq_m128i_u32(
+            vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers in b for greater than. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
+// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
+// in b for greater than.
+// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_u32(vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    return vreinterpretq_m128i_u32(
+            vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
-// Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx
-// see also:
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b )
+// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
+// in b for greater than.
+FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
 {
-	// Note: NEON does not have ordered compare builtin
-	// Need to compare a eq a and b eq b to check for NaN
-	// Do AND of results to get final
-	uint32x4_t ceqaa = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t ceqbb = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+#else
+    // ARMv7 lacks vcgtq_s64.
+    // This is based off of Clang's SSE2 polyfill:
+    // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
+
+    // Mask the sign bit out since we need a signed AND an unsigned comparison
+    // and it is ugly to try and split them.
+    int32x4_t mask   = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
+    int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
+    int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
+    // Check if a > b
+    int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
+    // Copy upper mask to lower mask
+    // a_hi > b_hi
+    int64x2_t gt_hi = vshrq_n_s64(greater, 63);
+    // Copy lower mask to upper mask
+    // a_lo > b_lo
+    int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
+    // Compare for equality
+    int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
+    // Copy upper mask to lower mask
+    // a_hi == b_hi
+    int64x2_t eq_hi = vshrq_n_s64(equal, 63);
+    // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
+    int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
+    return vreinterpretq_m128i_s64(ret);
+#endif
 }
-
-// Compares the lower single-precision floating point scalar values of a and b using a less than operation. : https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
-// Important note!! The documentation on MSDN is incorrect!  If either of the values is a NAN the docs say you will get a one, but in fact, it will return a zero!!
+// Compares the four 32-bit floats in a and b to check if any values are NaN.
+// Ordered compare between each value returns true for "orderable" and false for
+// "not orderable" (NaN).
+// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx
+// See also:
+// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
+// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
+FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
+{
+    // Note: NEON does not have ordered compare builtin
+    // Need to compare a eq a and b eq b to check for NaN
+    // Do AND of results to get final
+    uint32x4_t ceqaa =
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t ceqbb =
+            vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than operation:
+// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
+// Important note: the MSDN documentation is incorrect. It says that a NaN
+// operand yields one, but in fact the comparison returns zero.
 FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
 {
-	uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
-	uint32x4_t a_lt_b = vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-	return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0;
+    uint32x4_t a_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_lt_b =
+            vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
 }
 
-// Compares the lower single-precision floating point scalar values of a and b using a greater than operation. : https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than operation. :
+// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
 FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
 {
-	//return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-	uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-	uint32x4_t a_gt_b = vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b using a less than or equal operation. : https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
+    // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_gt_b =
+            vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
 FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
 {
-	//return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-	uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
-	uint32x4_t a_le_b = vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-	return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b using a greater than or equal operation. : https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
+    // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_le_b =
+            vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than or equal operation. :
+// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
 FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
 {
-	//return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-	uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-	uint32x4_t a_ge_b = vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b using an equality operation. : https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
+    // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_ge_b =
+            vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an equality operation. :
+// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
 FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
 {
-	//return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-	uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
-	uint32x4_t a_eq_b = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
-	return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b using an inequality operation. : https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
+    // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_eq_b =
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an inequality operation. :
+// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
 FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
 {
-	//return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)), 0);
-	uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
-	uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
-	uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
-	uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-	return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0) ? 1 : 0;
-}
-
-// according to the documentation, these intrinsics behave the same as the non-'u' versions.  We'll just alias them here.
-#define _mm_ucomilt_ss      _mm_comilt_ss
-#define _mm_ucomile_ss      _mm_comile_ss
-#define _mm_ucomigt_ss      _mm_comigt_ss
-#define _mm_ucomige_ss      _mm_comige_ss
-#define _mm_ucomieq_ss      _mm_comieq_ss
-#define _mm_ucomineq_ss     _mm_comineq_ss
+    // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+            vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
+    uint32x4_t a_neq_b = vmvnq_u32(
+            vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
+}
+
+// according to the documentation, these intrinsics behave the same as the
+// non-'u' versions.  We'll just alias them here.
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
 
 // ******************************************
 // Conversions
 // ******************************************
 
-// Converts the four single-precision, floating-point values of a to signed 32-bit integer values using truncate. https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values using truncate.
+// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
 {
-	return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
+    return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
 }
 
-// Converts the four signed 32-bit integer values of a to single-precision, floating-point values https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+// Converts the four signed 32-bit integer values of a to single-precision,
+// floating-point values
+// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
 FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
 {
-	return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
 }
 
-// Converts the four unsigned 8-bit integers in the lower 32 bits to four unsigned 32-bit integers. https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
+// Zero-extends the eight unsigned 8-bit integers in the lower 64 bits of a to
+// eight 16-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx HGFE DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0H0G 0F0E 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
 {
-	uint8x16_t u8x16 = vreinterpretq_u8_s32(a);        /* xxxx xxxx xxxx DCBA */
-	uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
-	uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
-	return vreinterpretq_s32_u32(u32x4);
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
 }
 
-// Converts the four signed 16-bit integers in the lower 64 bits to four signed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514079%28v=vs.100%29.aspx
+// Converts the two unsigned 8-bit integers in the lower 16 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Sign-extends the eight signed 8-bit integers in the lower 64 bits of a to
+// eight signed 16-bit integers ('s' below marks a sign-extension byte).
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx HGFE DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* sHsG sFsE sDsC sBsA */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Sign-extends the four signed 8-bit integers in the lower 32 bits of a to
+// four signed 32-bit integers ('s' below marks a sign-extension byte).
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* sxsx sxsx sDsC sBsA */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* sssD sssC sssB sssA */
+    return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Sign-extends the two signed 8-bit integers in the lower 16 bits of a to two
+// signed 64-bit integers ('s' below marks a sign-extension byte).
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* sxsx sxsx sxsx sBsA */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* sssx sssx sssB sssA */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* ssss sssB ssss sssA */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four signed 16-bit integers in the lower 64 bits to four signed
+// 32-bit integers.
 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
 {
-	return vreinterpretq_m128i_s32(vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+    return vreinterpretq_m128i_s32(
+            vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Sign-extends the two signed 16-bit integers in the lower 32 bits of a to
+// two signed 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four unsigned 16-bit integers in the lower 64 bits to four unsigned
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_u32(
+            vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Converts the two unsigned 16-bit integers in the lower 32 bits to two unsigned
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the two unsigned 32-bit integers in the lower 64 bits to two unsigned
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_u64(
+            vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
 }
 
-// Converts the four single-precision, floating-point values of a to signed 32-bit integer values. https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7 does not support!
-// It is supported on ARMv8 however.
+// Converts the two signed 32-bit integers in the lower 64 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+            vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values, rounding to nearest (ties to even).
+//
+//   r0 := (int) a0
+//   r1 := (int) a1
+//   r2 := (int) a2
+//   r3 := (int) a3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ArmV7-A
+// does not support! It is supported on ARMv8-A however.
 FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
 {
 #if defined(__aarch64__)
-	return vcvtnq_s32_f32(a);
+    return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
 #else
     uint32x4_t signmask = vdupq_n_u32(0x80000000);
-    float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */
-    int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
-    int32x4_t r_trunc = vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
-    int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
-    int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
-    float32x4_t delta = vsubq_f32(vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+    float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
+                                 vdupq_n_f32(0.5f)); /* +/- 0.5 */
+    int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
+            vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
+    int32x4_t r_trunc =
+            vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
+    int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+    int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+                                 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+    float32x4_t delta = vsubq_f32(
+            vreinterpretq_f32_m128(a),
+            vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
     uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
     return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
 #endif
 }
 
-// Moves the least significant 32 bits of a to a 32-bit integer. https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx
+// Moves the least significant 32 bits of a to a 32-bit integer.
+// https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx
 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
 {
-	return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+    return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
 }
 
-// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, zero extending the upper bits. https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
+// Returns the low-order 64 bits of a as a signed 64-bit integer (lane 0).
+// https://msdn.microsoft.com/en-us/library/bb531384(v=vs.120).aspx
+FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
+{
+    return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
+}
+
+// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
+// zero extending the upper bits.
+//
+//   r0 := a
+//   r1 := 0x0
+//   r2 := 0x0
+//   r3 := 0x0
+//
+// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
 {
-	return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
 }
 
+// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
+// zero extending the upper bits.
+//
+//   r0 := a
+//   r1 := 0x0
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
 
-// Applies a type cast to reinterpret four 32-bit floating point values passed in as a 128-bit parameter as packed 32-bit integers. https://msdn.microsoft.com/en-us/library/bb514099.aspx
+// Applies a type cast to reinterpret four 32-bit floating point values passed
+// in as a 128-bit parameter as packed 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb514099.aspx
 FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
 {
-	return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
 }
 
-// Applies a type cast to reinterpret four 32-bit integers passed in as a 128-bit parameter as packed 32-bit floating point values. https://msdn.microsoft.com/en-us/library/bb514029.aspx
+// Applies a type cast to reinterpret four 32-bit integers passed in as a
+// 128-bit parameter as packed 32-bit floating point values.
+// https://msdn.microsoft.com/en-us/library/bb514029.aspx
 FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
 {
-	return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
 }
 
-// Loads 128-bit value. : https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
 {
-	return vreinterpretq_m128i_s32(vld1q_s32((int32_t *)p));
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
 }
 
-//added by hasindu (verify this for requirement of alignment)
-// Loads 128-bit value. : https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
 {
-	return vreinterpretq_m128i_s32(vld1q_s32((int32_t *)p));
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
 }
 
+// _mm_lddqu_si128 functions the same as _mm_loadu_si128.
+#define _mm_lddqu_si128 _mm_loadu_si128
 
 // ******************************************
 // Miscellaneous Operations
 // ******************************************
 
-// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and saturates. https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
+// saturates.
+// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s8(vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), vqmovn_s16(vreinterpretq_s16_m128i(b))));
+    return vreinterpretq_m128i_s8(
+            vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+                        vqmovn_s16(vreinterpretq_s16_m128i(b))));
 }
 
-// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned integers and saturates. https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
+// integers and saturates.
+//
+//   r0 := UnsignedSaturate(a0)
+//   r1 := UnsignedSaturate(a1)
+//   ...
+//   r7 := UnsignedSaturate(a7)
+//   r8 := UnsignedSaturate(b0)
+//   r9 := UnsignedSaturate(b1)
+//   ...
+//   r15 := UnsignedSaturate(b7)
+//
+// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
 {
-	return vreinterpretq_m128i_u8(vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), vqmovun_s16(vreinterpretq_s16_m128i(b))));
+    return vreinterpretq_m128i_u8(
+            vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                        vqmovun_s16(vreinterpretq_s16_m128i(b))));
 }
 
-// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers and saturates. https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
+// and saturates.
+//
+//   r0 := SignedSaturate(a0)
+//   r1 := SignedSaturate(a1)
+//   r2 := SignedSaturate(a2)
+//   r3 := SignedSaturate(a3)
+//   r4 := SignedSaturate(b0)
+//   r5 := SignedSaturate(b1)
+//   r6 := SignedSaturate(b2)
+//   r7 := SignedSaturate(b3)
+//
+// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
 {
-	return vreinterpretq_m128i_s16(vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), vqmovn_s32(vreinterpretq_s32_m128i(b))));
+    return vreinterpretq_m128i_s16(
+            vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                         vqmovn_s32(vreinterpretq_s32_m128i(b))));
 }
 
-// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower 8 signed or unsigned 8-bit integers in b.  https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
+// Packs the 8 signed 32-bit integers from a and b into unsigned 16-bit
+// integers and saturates (negative inputs clamp to 0, large ones to 0xFFFF).
+//
+//   r0 := UnsignedSaturate(a0)
+//   r1 := UnsignedSaturate(a1)
+//   r2 := UnsignedSaturate(a2)
+//   r3 := UnsignedSaturate(a3)
+//   r4 := UnsignedSaturate(b0)
+//   r5 := UnsignedSaturate(b1)
+//   r6 := UnsignedSaturate(b2)
+//   r7 := UnsignedSaturate(b3)
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+            vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+                         vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
+// 8 signed or unsigned 8-bit integers in b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//   ...
+//   r14 := a7
+//   r15 := b7
+//
+// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
 {
-	int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
-	int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
-	int8x8x2_t result = vzip_s8(a1, b1);
-	return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
 }
 
-// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the lower 4 signed or unsigned 16-bit integers in b.  https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
+// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
+// lower 4 signed or unsigned 16-bit integers in b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//   r4 := a2
+//   r5 := b2
+//   r6 := a3
+//   r7 := b3
+//
+// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
 {
-	int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
-	int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
-	int16x4x2_t result = vzip_s16(a1, b1);
-	return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
 }
 
-// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the lower 2 signed or unsigned 32 - bit integers in b.  https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
+// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
+// lower 2 signed or unsigned 32 - bit integers in b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//
+// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
 {
-	int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
-	int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
-	int32x2x2_t result = vzip_s32(a1, b1);
-	return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
 }
 
-// Selects and interleaves the lower two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
+{
+    int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
+}
+
+// Selects and interleaves the lower two single-precision, floating-point values
+// from a and b.
+//
+//   r0 := a0
+//   r1 := b0
+//   r2 := a1
+//   r3 := b1
+//
+// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
 FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
 {
-	float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
-	float32x2x2_t result = vzip_f32(a1, b1);
-	return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
 }
 
-// Selects and interleaves the upper two single-precision, floating-point values from a and b. https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
+// Selects and interleaves the upper two single-precision, floating-point values
+// from a and b.
+//
+//   r0 := a2
+//   r1 := b2
+//   r2 := a3
+//   r3 := b3
+//
+// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
 FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
 {
-	float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
-	float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
-	float32x2x2_t result = vzip_f32(a1, b1);
-	return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
+    float32x2x2_t result = vzip_f32(a1, b1);
+    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
+#endif
 }
 
-// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper 8 signed or unsigned 8-bit integers in b.  https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
+// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
+// 8 signed or unsigned 8-bit integers in b.
+//
+//   r0 := a8
+//   r1 := b8
+//   r2 := a9
+//   r3 := b9
+//   ...
+//   r14 := a15
+//   r15 := b15
+//
+// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
 {
-	int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
-	int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
-	int8x8x2_t result = vzip_s8(a1, b1);
-	return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 =
+            vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 =
+            vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
 }
 
-// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the upper 4 signed or unsigned 16-bit integers in b.  https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
+// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
+// upper 4 signed or unsigned 16-bit integers in b.
+//
+//   r0 := a4
+//   r1 := b4
+//   r2 := a5
+//   r3 := b5
+//   r4 := a6
+//   r5 := b6
+//   r6 := a7
+//   r7 := b7
+//
+// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
 {
-	int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
-	int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
-	int16x4x2_t result = vzip_s16(a1, b1);
-	return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
 }
 
-// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the upper 2 signed or unsigned 32-bit integers in b.  https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
+// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
+// upper 2 signed or unsigned 32-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
 {
-	int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
-	int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
-	int32x2x2_t result = vzip_s32(a1, b1);
-	return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s32(vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+#else
+    int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
+    int32x2x2_t result = vzip_s32(a1, b1);
+    return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
+#endif
 }
 
-// Extracts the selected signed or unsigned 16-bit integer from a and zero extends.  https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-//FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
-EXTENSION ({ \
-	(vgetq_lane_s16(vreinterpretq_s16_m128i(a), (imm)) & 0x0000ffffUL); \
-})
+// Interleaves the upper signed or unsigned 64-bit integer in a with the
+// upper signed or unsigned 64-bit integer in b.
+//
+//   r0 := a1
+//   r1 := b1
+FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
+{
+    int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
+    int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
+}
 
-// Inserts the least significant 16 bits of b into the selected 16-bit integer of a. https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
-//FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, const int b, __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm) \
-EXTENSION ({ \
-	vreinterpretq_m128i_s16(vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
-})
+// shift to right
+// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
+// http://blog.csdn.net/hemmingway/article/details/44828303
+// Clang requires a macro here, as it is extremely picky about c being a literal.
+#define _mm_alignr_epi8(a, b, c) ((__m128i) vextq_s8((int8x16_t) (b), (int8x16_t) (a), (c)))
+
+// Extracts the selected signed or unsigned 8-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
+#define _mm_extract_epi8(a, imm) \
+    vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Inserts the least significant 8 bits of b into the selected 8-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, const int b,
+// __constrange(0,16) int imm)
+#define _mm_insert_epi8(a, b, imm)                                  \
+    ({                                                               \
+        vreinterpretq_m128i_s8(                                     \
+            vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
+    })
+
+// Extracts the selected signed or unsigned 16-bit integer from a and zero
+// extends.
+// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
+// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
+#define _mm_extract_epi16(a, imm) \
+    vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
+
+// Inserts the least significant 16 bits of b into the selected 16-bit integer
+// of a.
+// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
+// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, const int b,
+// __constrange(0,8) int imm)
+#define _mm_insert_epi16(a, b, imm)                                  \
+    ({                                                               \
+        vreinterpretq_m128i_s16(                                     \
+            vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
+    })
+
+// Extracts the selected signed or unsigned 32-bit integer from a and zero
+// extends.
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Inserts the least significant 32 bits of b into the selected 32-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, const int b,
+// __constrange(0,4) int imm)
+#define _mm_insert_epi32(a, b, imm)                                  \
+    ({                                                               \
+        vreinterpretq_m128i_s32(                                     \
+            vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
+    })
+
+
+// Extracts the selected signed or unsigned 64-bit integer from a and zero
+// extends.
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Inserts the least significant 64 bits of b into the selected 64-bit integer
+// of a.
+// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, const __int64 b,
+// __constrange(0,2) int imm)
+#define _mm_insert_epi64(a, b, imm)                                  \
+    ({                                                               \
+        vreinterpretq_m128i_s64(                                     \
+            vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
+    })
 
 // ******************************************
-// Streaming Extensions
+// Crypto Extensions
 // ******************************************
-
-// Guarantees that every preceding store is globally visible before any subsequent store.  https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_sfence(void)
-{
-	__sync_synchronize();
-}
-
-// Stores the data in a to the address p without polluting the caches.  If the cache line containing address p is already in the cache, the cache will be updated.Address p must be 16 - byte aligned.  https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
+#if defined(__ARM_FEATURE_CRYPTO)
+// Wraps vmull_p64
+FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
 {
-	*p = a;
+    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
+    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
+    return vreinterpretq_u64_p128(vmull_p64(a, b));
 }
 
-// Cache line containing p is flushed and invalidated from all caches in the coherency domain. : https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
-FORCE_INLINE void _mm_clflush(void const*)
-{
-	// no corollary for Neon?
+#else // ARMv7 polyfill
+// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
+//
+// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
+// 64-bit->128-bit polynomial multiply.
+//
+// It needs some work and is somewhat slow, but it is still faster than all
+// known scalar methods.
+//
+// Algorithm adapted to C from http://www.workofard.com/2017/07/ghash-for-low-end-cores/,
+// which is adapted from "Fast Software Polynomial Multiplication on
+// ARM Processors Using the NEON Engine" by Danilo Camara, Conrado Gouvea,
+// Julio Lopez and Ricardo Dahab (https://hal.inria.fr/hal-01506572)
+static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
+{
+    poly8x8_t a = vreinterpret_p8_u64(_a);
+    poly8x8_t b = vreinterpret_p8_u64(_b);
+
+    // Masks: k48_32 keeps the low 48 bits of its low half and the low 32 bits of its high half; k16_00 keeps only the low 16 bits of its low half
+    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), vcreate_u8(0x00000000ffffffff));
+    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), vcreate_u8(0x0000000000000000));
+
+    // Do the multiplies, rotating with vext to get all combinations
+    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));                // D = A0 * B0
+    uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
+    uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
+    uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
+    uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
+    uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
+    uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
+    uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
+
+    // Add cross products
+    uint8x16_t l = veorq_u8(e, f); // L = E + F
+    uint8x16_t m = veorq_u8(g, h); // M = G + H
+    uint8x16_t n = veorq_u8(i, j); // N = I + J
+
+    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL instructions.
+#if defined(__aarch64__)
+    uint8x16_t lm_p0 = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t lm_p1 = vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
+    uint8x16_t nk_p0 = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+    uint8x16_t nk_p1 = vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
+#else
+    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l),  vget_low_u8(m));
+    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
+    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n),  vget_low_u8(k));
+    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
+#endif
+    // t0 = (L) (P0 + P1) << 8
+    // t1 = (M) (P2 + P3) << 16
+    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
+    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
+    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
+
+    // t2 = (N) (P4 + P5) << 24
+    // t3 = (K) (P6 + P7) << 32
+    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
+    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
+    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
+
+    // De-interleave
+#if defined(__aarch64__)
+    uint8x16_t t0 = vreinterpretq_u8_u64(vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t1 = vreinterpretq_u8_u64(vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
+    uint8x16_t t2 = vreinterpretq_u8_u64(vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+    uint8x16_t t3 = vreinterpretq_u8_u64(vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
+#else
+    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
+    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l),  vget_low_u8(t0t1_h));
+    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
+    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l),  vget_low_u8(t2t3_h));
+#endif
+    // Shift the cross products
+    uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
+    uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
+    uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
+    uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
+
+    // Accumulate the products
+    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
+    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
+    uint8x16_t mix    = veorq_u8(d, cross1);
+    uint8x16_t r      = veorq_u8(mix, cross2);
+    return vreinterpretq_u64_u8(r);
+}
+
+#endif // ARMv7 polyfill
+FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
+{
+    uint64x2_t a = vreinterpretq_u64_m128i(_a);
+    uint64x2_t b = vreinterpretq_u64_m128i(_b);
+    switch (imm & 0x11) {
+        case 0x00: return vreinterpretq_m128i_u64(_sse2neon_vmull_p64(vget_low_u64(a),  vget_low_u64(b)));
+        case 0x01: return vreinterpretq_m128i_u64(_sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
+        case 0x10: return vreinterpretq_m128i_u64(_sse2neon_vmull_p64(vget_low_u64(a),  vget_high_u64(b)));
+        case 0x11: return vreinterpretq_m128i_u64(_sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
+        default: abort();
+    }
+}
+
+#if !defined(__ARM_FEATURE_CRYPTO) && defined(__aarch64__)
+// In the absence of crypto extensions, implement aesenc using regular neon
+// intrinsics instead. See:
+// http://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// http://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
+// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
+// for more information. Reproduced with permission of the author.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
+{
+    static const uint8_t crypto_aes_sbox[256] = {
+        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
+        0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+        0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
+        0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
+        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+        0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
+        0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
+        0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
+        0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
+        0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+        0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
+        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
+        0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+        0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
+        0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
+        0xb0, 0x54, 0xbb, 0x16};
+    static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
+                                         0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
+                                         0xc, 0x1, 0x6, 0xb};
+    static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+                                       0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
+
+    // shift rows
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    // sub bytes
+    v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
+    v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);
+
+    // mix columns
+    w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    //  add round key
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+}
+#elif defined(__ARM_FEATURE_CRYPTO)
+// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
+// AESMC and then manually applying the real key as an xor operation. This
+// unfortunately means an additional xor op; the compiler should be able to
+// optimise this away for repeated calls, however. See
+// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
+// for more details.
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(veorq_u8(
+        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
+        vreinterpretq_u8_m128i(b)));
 }
+#endif
 
-// *****
-// Needed in MMseqs2
-
-#define _mm_extract_epi8(a, imm) \
-EXTENSION ({ \
-    (vgetq_lane_s8(vreinterpretq_s8_m128i(a), (imm)) & 0x000000ffUL); \
-})
-
-#define _mm_srli_epi64(a, imm) \
-EXTENSION ({ \
-    __m128i ret; \
-    if ((imm) <= 0) { \
-        ret = a; \
-    } \
-    else if ((imm)> 31) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_u64(vshrq_n_u64(vreinterpretq_u64_m128i(a), (imm))); \
-    } \
-    ret; \
-})
-
-#define _mm_slli_epi64(a, imm) \
-EXTENSION ({ \
-    __m128i ret; \
-    if ((imm) <= 0) {\
-        ret = a; \
-    } \
-    else if ((imm) > 31) { \
-        ret = _mm_setzero_si128(); \
-    } \
-    else { \
-        ret = vreinterpretq_m128i_s64(vshlq_n_s64(vreinterpretq_s64_m128i(a), (imm))); \
-    } \
-    ret; \
-})
+// ******************************************
+// Streaming Extensions
+// ******************************************
 
-FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+// Guarantees that every preceding store is globally visible before any
+// subsequent store.
+// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_sfence(void)
 {
-    /* apoty: issue with large values because of result saturation */
-    //int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)); /* =2*a*b */
-    //return vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
-    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
-    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
-    uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
-    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
-    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
-    uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
-    uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
-    return vreinterpretq_m128i_u16(r.val[1]);
+    __sync_synchronize();
 }
 
-FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+// Stores the data in a to the address p without polluting the caches.  If the
+// cache line containing address p is already in the cache, the cache will be
+// updated. Address p must be 16-byte aligned.
+// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
+FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
 {
-    return vreinterpretq_m128i_u32(vmulq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
 }
 
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+// Cache line containing p is flushed and invalidated from all caches in the
+// coherency domain. :
+// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
+FORCE_INLINE void _mm_clflush(void const *p)
 {
-    return vreinterpretq_m128i_u32(vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+    (void)p;
+    // no corollary for Neon?
 }
 
 #if defined(__GNUC__) || defined(__clang__)
-#	pragma pop_macro("ALIGN_STRUCT")
-#	pragma pop_macro("FORCE_INLINE")
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
 #endif
 
 #endif
diff --git a/lib/simd/sse2wasm.h b/lib/simd/sse2wasm.h
new file mode 100644
index 0000000..1f0fea3
--- /dev/null
+++ b/lib/simd/sse2wasm.h
@@ -0,0 +1,166 @@
+// sse2wasm is still very incomplete
+// licensed under GPLv3 see LICENCE file
+#ifndef SSE2WASM
+#define SSE2WASM
+
+#define __wasm_unimplemented_simd128__
+#include <wasm_simd128.h>
+#define SSE 1
+typedef v128_t __m128d;
+typedef v128_t __m128i;
+typedef v128_t __m128;
+
+#define _mm_add_ps        wasm_f32x4_add
+#define _mm_sub_ps        wasm_f32x4_sub
+#define _mm_mul_ps        wasm_f32x4_mul
+#define _mm_div_ps        wasm_f32x4_div
+#define _mm_rcp_ps(x)     wasm_f32x4_div(wasm_f32x4_splat(1), (x))
+#define _mm_max_ps        wasm_f32x4_max
+#define _mm_min_ps        wasm_f32x4_min
+#define _mm_load_ps       wasm_v128_load
+#define _mm_load_ss(p)    wasm_f32x4_make(*(const float *)(p), 0.0f, 0.0f, 0.0f) /* reads ONE float; lanes 1..3 are zeroed */
+#define _mm_store_ps      wasm_v128_store
+#define _mm_store_ss(p, a) (*(float *)(p) = wasm_f32x4_extract_lane((a), 0)) /* writes lane 0 only: 4 bytes, not 16 */
+#define _mm_set1_ps       wasm_f32x4_splat
+#define _mm_setzero_ps(x) wasm_f32x4_splat(0)
+#define _mm_cmpgt_ps      wasm_f32x4_gt
+#define _mm_cmpeq_ps      wasm_f32x4_eq
+#define _mm_cmplt_ps      wasm_f32x4_lt
+#define _mm_or_ps         wasm_v128_or
+#define _mm_and_ps        wasm_v128_and
+#define _mm_andnot_ps(a, b) wasm_v128_andnot((b), (a)) /* SSE: (~a) & b; wasm_v128_andnot(x, y): x & ~y, so operands swap */
+#define _mm_xor_ps        wasm_v128_xor
+#define _mm_cvtps_epi32   NOT_YET_IMP
+#define _mm_castps_si128  NOT_YET_IMP
+
+#define _mm_add_epi32     wasm_i32x4_add
+#define _mm_add_epi16     wasm_i16x8_add
+#define _mm_add_epi8      wasm_i8x16_add
+#define _mm_adds_epi16    wasm_i16x8_add_saturate
+#define _mm_adds_epu8     wasm_u8x16_add_saturate
+#define _mm_sub_epi32     wasm_i32x4_sub
+#define _mm_sub_epi16     wasm_i16x8_sub
+#define _mm_sub_epi8      wasm_i8x16_sub
+#define _mm_subs_epu16    wasm_u16x8_sub_saturate
+#define _mm_subs_epu8     wasm_u8x16_sub_saturate
+#define _mm_mullo_epi32   NOT_YET_IMP
+#define _mm_max_epi32     wasm_i32x4_max /* signed, 4 x 32-bit lanes */
+#define _mm_max_epi16     wasm_i16x8_max /* signed, 8 x 16-bit lanes */
+#define _mm_max_epu8      wasm_u8x16_max /* unsigned, 16 x 8-bit lanes */
+#define _mm_min_epu8      wasm_u8x16_min /* unsigned, 16 x 8-bit lanes */
+#define _mm_load_si128    wasm_v128_load
+#define _mm_loadu_si128   wasm_v128_load
+#define _mm_stream_load_si128 NOT_YET_IMP
+#define _mm_storeu_si128  wasm_v128_store
+#define _mm_store_si128   wasm_v128_store
+#define _mm_set1_epi32    wasm_i32x4_splat
+#define _mm_set1_epi16    wasm_i16x8_splat
+#define _mm_set1_epi8     wasm_i8x16_splat
+#define _mm_set_epi32(e3, e2, e1, e0) wasm_i32x4_make((e0), (e1), (e2), (e3)) /* SSE lists the highest lane first, wasm_*_make the lowest */
+#define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) wasm_i16x8_make((e0), (e1), (e2), (e3), (e4), (e5), (e6), (e7))
+#define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) wasm_i8x16_make((e0), (e1), (e2), (e3), (e4), (e5), (e6), (e7), (e8), (e9), (e10), (e11), (e12), (e13), (e14), (e15))
+#define _mm_setzero_si128(x) wasm_i64x2_splat(0)
+#define _mm_cmpgt_epi32   wasm_i32x4_gt
+#define _mm_cmpgt_epi16   wasm_i16x8_gt
+#define _mm_cmpgt_epi8    wasm_i8x16_gt
+#define _mm_cmpeq_epi32   wasm_i32x4_eq
+#define _mm_cmpeq_epi16   wasm_i16x8_eq
+#define _mm_cmpeq_epi8    wasm_i8x16_eq
+#define _mm_cmplt_epi32   wasm_i32x4_lt
+#define _mm_cmplt_epi16   wasm_i16x8_lt
+#define _mm_cmplt_epi8    wasm_i8x16_lt
+#define _mm_or_si128      wasm_v128_or
+#define _mm_and_si128     wasm_v128_and
+#define _mm_andnot_si128(a, b) wasm_v128_andnot((b), (a)) /* SSE: (~a) & b; wasm_v128_andnot(x, y): x & ~y, so operands swap */
+#define _mm_xor_si128     wasm_v128_xor
+#define _mm_extract_epi16 wasm_i16x8_extract_lane
+#define _mm_extract_epi8  wasm_i8x16_extract_lane
+#define _mm_slli_epi16    NOT_YET_IMP
+#define _mm_srli_epi16    NOT_YET_IMP
+#define _mm_slli_epi32    NOT_YET_IMP
+#define _mm_srli_epi32    NOT_YET_IMP
+#define _mm_cvtepi32_ps(x) wasm_f32x4_convert_i32x4(x) /* numeric int32 -> float conversion per lane, not a bit cast */
+#define _mm_castsi128_ps  NOT_YET_IMP
+
+static inline void _mm_storel_epi64(__m128i* mem_addr, __m128i a) { /* store the low 64 bits of a to mem_addr */
+    __builtin_memcpy(mem_addr, &a, 8); /* lane 0 is the first 8 bytes (WebAssembly is little-endian); memcpy avoids alignment/aliasing assumptions */
+}
+
+// _mm_setr_epi32: build a vector from four 32-bit integers supplied in
+// "reversed" (memory) order, i.e. the first argument becomes lane 0 and the
+// last argument becomes lane 3.
+//
+// The parameter names mirror the SSE2 prototype, so `e3` is the value that
+// ends up in the lowest lane.
+static inline __m128i _mm_setr_epi32(int e3, int e2, int e1, int e0) {
+    // wasm_i32x4_make fills lanes in argument order, which is exactly the
+    // ordering setr asks for.
+    return wasm_i32x4_make(e3, e2, e1, e0);
+}
+
+//__builtin_convertvector
+
+static inline __m128i _mm_slli_si128(__m128i a, int imm8) { /* FIXME: stub, does not perform a byte shift */
+    return _mm_setzero_si128(); /* a and imm8 are ignored; the result is always an all-zero vector */
+}
+
+static inline __m128i _mm_srli_si128(__m128i a, int imm8) { /* FIXME: stub, does not perform a byte shift */
+    return _mm_setzero_si128(); /* a and imm8 are ignored; the result is always an all-zero vector */
+}
+
+// Scalar emulation of SSE2 pmovmskb.
+//
+// Collects the most significant bit of each of the 16 bytes of `a` into a
+// 16-bit mask: bit i of the result is the top bit of byte i, so byte 0 maps
+// to bit 0 and byte 15 maps to bit 15.
+static inline unsigned short _mm_movemask_epi8(__m128i a) {
+    // View the vector as 16 individual bytes.
+    union {
+        __m128i si;
+        char as_char[16];
+    } lanes;
+    unsigned int mask = 0;
+    int byte;
+
+    lanes.si = a;
+
+    for (byte = 0; byte < 16; byte++) {
+        // Keep only the sign bit (0x80) of this byte; after the integer
+        // promotion of `char` the result is either 0 or 0x80.
+        unsigned int const top = (unsigned int)(lanes.as_char[byte] & 0x80);
+
+        // Move it down to bit 0, then up to its slot in the mask.
+        mask |= (top >> 7) << byte;
+    }
+
+    return mask;
+}
+
+#define _MM_SHUFFLE(a, b, c, d) _mm_setzero_si128() /* FIXME: placeholder; discards a..d and expands to a zero vector */
+
+// FIXME: the three shuffles below are placeholders; each ignores both operands and returns an all-zero vector.
+static inline __m128i _mm_shuffle_epi32(__m128i a, __m128i b) {
+    return _mm_setzero_si128(); /* stub result, not a shuffle of a */
+}
+
+static inline __m128i _mm_shuffle_epi16(__m128i a, __m128i b) {
+    return _mm_setzero_si128(); /* stub result, not a shuffle of a */
+}
+// TODO: presumably meant to be implemented with wasm_v8x16_shuffle -- confirm
+static inline __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {
+    return _mm_setzero_si128(); /* stub result, not a shuffle of a */
+}
+
+static inline int64_t _mm_cvtsi128_si64(__m128 a) { /* __m128 and __m128i are both typedefs of v128_t in this header */
+    return wasm_i64x2_extract_lane(a, 0); /* value of 64-bit lane 0 */
+}
+
+static inline __m128i _mm_cvtsi64_si128(int64_t a) { /* build a vector from one 64-bit integer */
+    return wasm_i64x2_make(a, 0); /* lane 0 = a, lane 1 = 0 */
+}
+
+static inline __m128i _mm_cvtsi32_si128(int a) { /* 32-bit lane 0 = a, all remaining bits zero */
+    return wasm_i32x4_make(a, 0, 0, 0); /* do not widen to int64: a negative a would sign-extend into lane 1 */
+}
+
+#endif
diff --git a/lib/tinyexpr/CMakeLists.txt b/lib/tinyexpr/CMakeLists.txt
index 3d37e57..8ca4154 100644
--- a/lib/tinyexpr/CMakeLists.txt
+++ b/lib/tinyexpr/CMakeLists.txt
@@ -12,6 +12,9 @@ option(build_tinyexpr_example2 "Build TinyExpr example 2." OFF)
 option(build_tinyexpr_example3 "Build TinyExpr example 3." OFF)
 
 find_library(MATH_LIB m)
+if(NOT MATH_LIB)
+    set(MATH_LIB "")
+endif()
 
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ansi -Wall -Wshadow -fPIC -O3")
 
diff --git a/lib/xxhash/xxh3.h b/lib/xxhash/xxh3.h
new file mode 100644
index 0000000..04fbaeb
--- /dev/null
+++ b/lib/xxhash/xxh3.h
@@ -0,0 +1,1632 @@
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Development source file for `xxh3`
+   Copyright (C) 2019-present, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Note :
+   This file is separated for development purposes.
+   It will be integrated into `xxhash.c` when development phase is complete.
+*/
+
+#ifndef XXH3_H
+#define XXH3_H
+
+
+/* ===   Dependencies   === */
+
+#undef XXH_INLINE_ALL   /* in case it's already defined */
+#define XXH_INLINE_ALL
+#include "xxhash.h"
+
+
+/* ===   Compiler specifics   === */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXH_RESTRICT   restrict
+#else
+/* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
+#  define XXH_RESTRICT   /* disable */
+#endif
+
+#if defined(__GNUC__)
+#  if defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#    define inline __inline__  /* clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  endif
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * Sanity check.
+ *
+ * XXH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *
+ * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
+ * classic 16-bit only subset of ARM's instruction set.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we
+ * will give a warning.
+ *
+ * Usually, if this happens, it is because of an accident and you probably
+ * need to specify -march, as you probably meant to compile for a newer
+ * architecture.
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
+#endif
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+#define XXH_SCALAR 0
+#define XXH_SSE2   1
+#define XXH_AVX2   2
+#define XXH_NEON   3
+#define XXH_VSX    4
+
+#ifndef XXH_VECTOR    /* can be defined on command line */
+#  if defined(__AVX2__)
+#    define XXH_VECTOR XXH_AVX2
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define XXH_VECTOR XXH_SSE2
+#  elif defined(__GNUC__) /* msvc support maybe later */ \
+  && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
+  && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
+    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+#    define XXH_VECTOR XXH_NEON
+#  elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__)
+#    define XXH_VECTOR XXH_VSX
+#  else
+#    define XXH_VECTOR XXH_SCALAR
+#  endif
+#endif
+
+/* control alignment of accumulator,
+ * for compatibility with fast vector loads */
+#ifndef XXH_ACC_ALIGN
+#  if XXH_VECTOR == 0   /* scalar */
+#     define XXH_ACC_ALIGN 8
+#  elif XXH_VECTOR == 1  /* sse2 */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == 2  /* avx2 */
+#     define XXH_ACC_ALIGN 32
+#  elif XXH_VECTOR == 3  /* neon */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == 4  /* vsx */
+#     define XXH_ACC_ALIGN 16
+#  endif
+#endif
+
+/* xxh_u64 XXH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */
+#if defined(_MSC_VER) && defined(_M_IX86)
+#    include <intrin.h>
+#    define XXH_mult32to64(x, y) __emulu(x, y)
+#else
+#    define XXH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF))
+#endif
+
+/* VSX stuff. It's a lot because VSX support is mediocre across compilers and
+ * there is a lot of mischief with endianness. */
+#if XXH_VECTOR == XXH_VSX
+#  include <altivec.h>
+#  undef vector
+typedef __vector unsigned long long U64x2;
+typedef __vector unsigned char U8x16;
+typedef __vector unsigned U32x4;
+
+#ifndef XXH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1
+#  else
+#    define XXH_VSX_BE 0
+#  endif
+#endif
+
+/* We need some helpers for big endian mode. */
+#if XXH_VSX_BE
+/* A wrapper for POWER9's vec_revb. */
+#  ifdef __POWER9_VECTOR__
+#    define XXH_vec_revb vec_revb
+#  else
+XXH_FORCE_INLINE U64x2 XXH_vec_revb(U64x2 val)
+{
+    U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                              0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+
+/* Power8 Crypto gives us vpermxor which is very handy for
+ * PPC64EB.
+ *
+ * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask)
+ * {
+ *     U8x16 ret;
+ *     for (int i = 0; i < 16; i++) {
+ *         ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4];
+ *     }
+ *     return ret;
+ * }
+ *
+ * Because both of the main loops load the key, swap, and xor it with input,
+ * we can combine the key swap into this instruction.
+ */
+#  ifdef vec_permxor
+#    define XXH_vec_permxor vec_permxor
+#  else
+#    define XXH_vec_permxor __builtin_crypto_vpermxor
+#  endif
+#endif  /* XXH_VSX_BE */
+/*
+ * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes
+ * vec_mule.
+ *
+ * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while.
+ * Clang has an easy way to control this, we can just use the builtin which doesn't swap.
+ * GCC needs inline assembly. */
+#if __has_builtin(__builtin_altivec_vmuleuw)
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+#else
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE U64x2 XXH_vec_mulo(U32x4 a, U32x4 b) {
+    U64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXH_FORCE_INLINE U64x2 XXH_vec_mule(U32x4 a, U32x4 b) {
+    U64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+#endif /* __has_builtin(__builtin_altivec_vmuleuw) */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+/* XXH_PREFETCH(ptr): read-prefetch hint for the data at ptr; expands to a plain
+ * (void)(ptr) when unsupported or when the XXH_NO_PREFETCH build macro is set. */
+#if defined(XXH_NO_PREFETCH)
+#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() is not defined outside of x86/x64; _M_IX86 = 32-bit x86 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+#  error "default keyset is not large enough"
+#endif
+
+XXH_ALIGN(64) static const xxh_u8 kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+        0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+        0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+        0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+        0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+        0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+        0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+        0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+        0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+
+        0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+        0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+        0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+        0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+/*
+ * GCC for x86 has a tendency to use SSE in this loop. While it
+ * successfully avoids swapping (as MUL overwrites EAX and EDX), it
+ * slows it down because instead of free register swap shifts, it
+ * must use pshufd and punpckl/hd.
+ *
+ * To prevent this, we use this attribute to shut off SSE.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
+__attribute__((__target__("no-sse")))
+#endif
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Despite being a 32-bit platform, Clang (and emscripten) define this
+     * type despite not having the arithmetic for it. This results in a
+     * laggy compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if defined(__GNUC__) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) };
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif defined(_M_X64) || defined(_M_IA64)
+
+    #ifndef _MSC_VER
+#   pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t const r128 = { product_low, product_high };
+    return r128;
+
+#else
+    /*
+     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+     *
+     * This is a fast and simple grade school multiply, which is shown
+     * below with base 10 arithmetic instead of base 0x100000000.
+     *
+     *           9 3 // D2 lhs = 93
+     *         x 7 5 // D2 rhs = 75
+     *     ----------
+     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10)
+     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10)
+     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10)
+     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10)
+     *     ---------
+     *         2 7 | // D2 cross  = (15 / 10) + (45 % 10) + 21
+     *     + 6 7 | | // D2 upper  = (27 / 10) + (45 / 10) + 63
+     *     ---------
+     *       6 9 7 5
+     *
+     * The reasons for adding the products like this are:
+     *  1. It avoids manual carry tracking. Just like how
+     *     (9 * 9) + 9 + 9 = 99, the same applies with this for
+     *     UINT64_MAX. This avoids a lot of complexity.
+     *
+     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
+     *     instruction available in ARMv6+ A32/T32, which is shown below:
+     *
+     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+     *         {
+     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+     *             *RdHi = (xxh_u32)(product >> 32);
+     *         }
+     *
+     *     This instruction was designed for efficient long multiplication,
+     *     and allows this to be calculated in only 4 instructions which
+     *     is comparable to some 64-bit ALUs.
+     *
+     *  3. It isn't terrible on other platforms. Usually this will be
+     *     a couple of 32-bit ADD/ADCs.
+     */
+
+    /* First calculate all of the cross products. */
+    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
+    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
+
+    /* Now add the products together. These will never overflow. */
+    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
+    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+    XXH128_hash_t r128 = { lower, upper };
+    return r128;
+#endif
+}
+
+/*
+ * We want to keep the attribute here because a target switch
+ * disables inlining.
+ *
+ * Does a 64-bit to 128-bit multiply, then XOR folds it.
+ * The reason for the separate function is to prevent passing
+ * too many structs around by value. This will hopefully inline
+ * the multiply, but we don't force it.
+ */
+#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
+__attribute__((__target__("no-sse")))
+#endif
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+    XXH128_hash_t const wide = XXH_mult64to128(lhs, rhs);   /* full 128-bit product */
+    return wide.high64 ^ wide.low64;                        /* fold: XOR the two 64-bit halves */
+}
+
+
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+    /* xorshift by 37, multiply by PRIME64_3, xorshift by 32 */
+    xxh_u64 const folded = h64 ^ (h64 >> 37);
+    xxh_u64 const scaled = folded * PRIME64_3;
+    return scaled ^ (scaled >> 32);
+}
+
+
+/* ==========================================
+ * Short keys
+ * ========================================== */
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    {   /* first, middle and last byte; they coincide when len < 3 */
+        xxh_u32 const first  = input[0];
+        xxh_u32 const middle = input[len >> 1];
+        xxh_u32 const last   = input[len - 1];
+        /* pack as [ len | last | middle | first ], one byte each, first in the low byte */
+        xxh_u32 const packed = first | (middle << 8) | (last << 16) | (((xxh_u32)len) << 24);
+        return XXH3_avalanche(((xxh_u64)packed ^ (XXH_readLE32(secret) + seed)) * PRIME64_1);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    {   /* leading and trailing 32-bit words; they overlap when len < 8 */
+        xxh_u64 const head = XXH_readLE32(input);
+        xxh_u64 const tail = XXH_readLE32(input + len - 4);
+        xxh_u64 const keyed = (head | (tail << 32)) ^ (XXH_readLE64(secret) + seed);
+        xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1);
+        return XXH3_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2);
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   /* key the first and last 8 input bytes (they may overlap) with separate secret words */
+        xxh_u64 const lo = XXH_readLE64(input)           ^ (XXH_readLE64(secret)     + seed);
+        xxh_u64 const hi = XXH_readLE64(input + len - 8) ^ (XXH_readLE64(secret + 8) - seed);
+        return XXH3_avalanche(len + (lo + hi) + XXH3_mul128_fold64(lo, hi));
+    }
+}
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    /* dispatch on length, shortest inputs first; an empty input hashes to 0 */
+    if (len == 0) return 0;
+    if (len <= 3) return XXH3_len_1to3_64b(input, len, secret, seed);
+    if (len <= 8) return XXH3_len_4to8_64b(input, len, secret, seed);
+    return XXH3_len_9to16_64b(input, len, secret, seed);
+}
+
+
+/* ===    Long Keys    === */
+
+#define STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64))
+
+typedef enum { XXH3_acc_64bits, XXH3_acc_128bits } XXH3_accWidth_e;
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512(      void* XXH_RESTRICT acc,
+                          const void* XXH_RESTRICT input,
+                          const void* XXH_RESTRICT secret,
+                          XXH3_accWidth_e accWidth)
+{
+#if (XXH_VECTOR == XXH_AVX2)
+
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   XXH_ALIGN(32) __m256i* const xacc  =       (__m256i *) acc;
+        const         __m256i* const xinput = (const __m256i *) input;  /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
+        const         __m256i* const xsecret = (const __m256i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+            __m256i const data_vec = _mm256_loadu_si256 (xinput+i);
+            __m256i const key_vec = _mm256_loadu_si256 (xsecret+i);
+            __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
+            __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
+            if (accWidth == XXH3_acc_128bits) {
+                __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+                __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
+                xacc[i]  = _mm256_add_epi64(product, sum);
+            } else {  /* XXH3_acc_64bits */
+                __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
+                xacc[i]  = _mm256_add_epi64(product, sum);
+            }
+        }   }
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   XXH_ALIGN(16) __m128i* const xacc  =       (__m128i *) acc;
+        const         __m128i* const xinput = (const __m128i *) input;  /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
+        const         __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+            __m128i const data_vec = _mm_loadu_si128 (xinput+i);
+            __m128i const key_vec = _mm_loadu_si128 (xsecret+i);
+            __m128i const data_key = _mm_xor_si128 (data_vec, key_vec);                                  /* uint32 dk[8]  = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */
+            __m128i const product = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31));  /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
+            if (accWidth == XXH3_acc_128bits) {
+                __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+                __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
+                xacc[i]  = _mm_add_epi64(product, sum);
+            } else {  /* XXH3_acc_64bits */
+                __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
+                xacc[i]  = _mm_add_epi64(product, sum);
+            }
+    }   }
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {
+        XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+        uint8_t const* const xinput = (const uint8_t *) input;
+        uint8_t const* const xsecret  = (const uint8_t *) secret;
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
+#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
+            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this.
+             * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang
+             * assumes I don't want to destroy it and tries to make a copy. This slows down the code
+             * a lot.
+             * aarch64 not only uses an entirely different syntax, but it requires three
+             * instructions...
+             *    ext    v1.16B, v0.16B, #8    // select high bits because aarch64 can't address them directly
+             *    zip1   v3.2s, v0.2s, v1.2s   // first zip
+             *    zip2   v2.2s, v0.2s, v1.2s   // second zip
+             * ...to do what ARM does in one:
+             *    vzip.32 d0, d1               // Interleave high and low bits and overwrite. */
+
+            /* data_vec = xinput[i]; */
+            uint8x16_t const data_vec    = vld1q_u8(xinput + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint8x16_t const key_vec     = vld1q_u8(xsecret  + (i * 16));
+            /* data_key = data_vec ^ key_vec; */
+            uint32x4_t       data_key;
+
+            if (accWidth == XXH3_acc_64bits) {
+                /* Add first to prevent register swaps */
+                /* xacc[i] += data_vec; */
+                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
+            } else {  /* XXH3_acc_128bits */
+                /* xacc[i] += swap(data_vec); */
+                /* can probably be optimized better */
+                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
+                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
+                xacc[i] = vaddq_u64 (xacc[i], swapped);
+            }
+
+            data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));
+
+            /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place.
+             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
+            __asm__("vzip.32 %e0, %f0" : "+w" (data_key));
+            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
+            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));
+
+#else
+            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. */
+
+            /* data_vec = xinput[i]; */
+            uint8x16_t const data_vec    = vld1q_u8(xinput + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint8x16_t const key_vec     = vld1q_u8(xsecret  + (i * 16));
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t const data_key    = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
+            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
+            uint32x2_t const data_key_lo = vmovn_u64  (data_key);
+            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
+            uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32);
+            if (accWidth == XXH3_acc_64bits) {
+                /* xacc[i] += data_vec; */
+                xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec));
+            } else {  /* XXH3_acc_128bits */
+                /* xacc[i] += swap(data_vec); */
+                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
+                uint64x2_t const swapped= vextq_u64(data64, data64, 1);
+                xacc[i] = vaddq_u64 (xacc[i], swapped);
+            }
+            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
+            xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
+
+#endif
+        }
+    }
+
+#elif (XXH_VECTOR == XXH_VSX)
+          U64x2* const xacc =        (U64x2*) acc;    /* presumed aligned */
+    U64x2 const* const xinput = (U64x2 const*) input;   /* no alignment restriction */
+    U64x2 const* const xsecret  = (U64x2 const*) secret;    /* no alignment restriction */
+    U64x2 const v32 = { 32,  32 };
+#if XXH_VSX_BE
+    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
+                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
+#endif
+    size_t i;
+    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
+        /* data_vec = xinput[i]; */
+        /* key_vec = xsecret[i]; */
+#if XXH_VSX_BE
+        /* byteswap */
+        U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
+        U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
+        /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
+        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
+#else
+        U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
+        U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
+        U64x2 const data_key = data_vec ^ key_vec;
+#endif
+        /* shuffled = (data_key << 32) | (data_key >> 32); */
+        U32x4 const shuffled = (U32x4)vec_rl(data_key, v32);
+        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */
+        U64x2 const product = XXH_vec_mulo((U32x4)data_key, shuffled);
+        xacc[i] += product;
+
+        if (accWidth == XXH3_acc_64bits) {
+            xacc[i] += data_vec;
+        } else {  /* XXH3_acc_128bits */
+            /* swap high and low halves */
+            U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
+            xacc[i] += data_swapped;
+        }
+    }
+
+#else   /* scalar variant of Accumulator - universal */
+
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;    /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
+    const xxh_u8* const xinput = (const xxh_u8*) input;  /* no alignment restriction */
+    const xxh_u8* const xsecret  = (const xxh_u8*) secret;   /* no alignment restriction */
+    size_t i;
+    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+    for (i=0; i < ACC_NB; i++) {
+        xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
+        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
+
+        if (accWidth == XXH3_acc_64bits) {
+            xacc[i] += data_val;
+        } else {
+            xacc[i ^ 1] += data_val; /* swap adjacent lanes */
+        }
+        xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
+    }
+#endif
+}
+
+XXH_FORCE_INLINE void   /* XXH3_scrambleAcc() : per 64-bit lane, acc = ((acc ^ (acc >> 47)) ^ key) * PRIME32_1, `key` being the matching 8 bytes of `secret` (the scalar variant at the bottom spells this out) */
+XXH3_scrambleAcc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+#if (XXH_VECTOR == XXH_AVX2)
+
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
+        const         __m256i* const xsecret = (const __m256i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */
+        const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1);
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) {
+            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
+            __m256i const acc_vec     = xacc[i];
+            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+            /* data_key = data_vec ^ xsecret[i]; (unaligned load) */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+
+            /* xacc[i] = data_key * PRIME32_1; built from two 32x32->64 products : low halves, plus high halves shifted left by 32 */
+            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31);
+            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+        }
+    }
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
+        const         __m128i* const xsecret = (const __m128i *) secret;   /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */
+        const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1);
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) {
+            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
+            __m128i const acc_vec     = xacc[i];
+            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
+            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
+            /* data_key = data_vec ^ xsecret[i]; (unaligned load) */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+
+            /* xacc[i] = data_key * PRIME32_1; built from two 32x32->64 products : low halves, plus high halves shifted left by 32 */
+            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31);
+            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
+            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+        }
+    }
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   uint64x2_t* const xacc =     (uint64x2_t*) acc;
+        uint8_t const* const xsecret = (uint8_t const*) secret;
+        uint32x2_t const prime     = vdup_n_u32 (PRIME32_1);
+
+        size_t i;
+        for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) {
+            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
+            uint64x2_t const   acc_vec  = xacc[i];
+            uint64x2_t const   shifted  = vshrq_n_u64 (acc_vec, 47);
+            uint64x2_t const   data_vec = veorq_u64   (acc_vec, shifted);
+
+            /* key_vec  = xsecret[i]; (byte-wise load, 16 bytes per iteration) */
+            uint32x4_t const   key_vec  = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
+            /* data_key = data_vec ^ key_vec; */
+            uint32x4_t const   data_key = veorq_u32   (vreinterpretq_u32_u64(data_vec), key_vec);
+            /* shuffled = { data_key[0, 2], data_key[1, 3] }; i.e. val[0] = low 32-bit halves, val[1] = high 32-bit halves */
+            uint32x2x2_t const shuffled = vzip_u32    (vget_low_u32(data_key), vget_high_u32(data_key));
+
+            /* data_key *= PRIME32_1 */
+
+            /* prod_hi = (data_key >> 32) * PRIME32_1; */
+            uint64x2_t const   prod_hi = vmull_u32    (shuffled.val[1], prime);
+            /* xacc[i] = prod_hi << 32; */
+            xacc[i] = vshlq_n_u64(prod_hi, 32);
+            /* xacc[i] += (data_key & 0xFFFFFFFF) * PRIME32_1; */
+            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
+    }   }
+
+#elif (XXH_VECTOR == XXH_VSX)
+
+          U64x2* const xacc =       (U64x2*) acc;
+    const U64x2* const xsecret = (const U64x2*) secret;
+    /* constants */
+    U64x2 const v32  = { 32, 32 };
+    U64x2 const v47 = { 47, 47 };
+    U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 };
+    size_t i;
+#if XXH_VSX_BE
+    /* endian swap */
+    U8x16 const vXorSwap  = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
+                              0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 };
+#endif
+    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
+        U64x2 const acc_vec  = xacc[i];
+        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+        /* key_vec = xsecret[i]; */
+#if XXH_VSX_BE
+        /* swap bytes words */
+        U64x2 const key_raw  = vec_vsx_ld(0, xsecret + i);
+        U64x2 const data_key = (U64x2)XXH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap);
+#else
+        U64x2 const key_vec  = vec_vsx_ld(0, xsecret + i);
+        U64x2 const data_key = data_vec ^ key_vec;
+#endif
+
+        /* data_key *= PRIME32_1 */
+
+        /* prod_even : products from XXH_vec_mule(); shifted left by 32 below, i.e. used as the high-half contribution. NOTE(review): was labelled prod_lo, which does not match that use - confirm against XXH_vec_mule */
+        U64x2 const prod_even  = XXH_vec_mule((U32x4)data_key, prime);
+        /* prod_odd : products from XXH_vec_mulo(); added unshifted below, i.e. used as the low-half contribution. NOTE(review): was labelled prod_hi - confirm against XXH_vec_mulo */
+        U64x2 const prod_odd  = XXH_vec_mulo((U32x4)data_key, prime);
+        xacc[i] = prod_odd + (prod_even << v32);
+    }
+
+#else   /* scalar variant of Scrambler - universal */
+
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    size_t i;
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    for (i=0; i < ACC_NB; i++) {
+        xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);   /* lane i is keyed by secret bytes [8*i, 8*i+8), read as little-endian */
+        xxh_u64 acc64 = xacc[i];
+        acc64 ^= acc64 >> 47;
+        acc64 ^= key64;
+        acc64 *= PRIME32_1;   /* unsigned 64-bit multiply : wraps modulo 2^64 */
+        xacc[i] = acc64;
+    }
+
+#endif
+}
+
+#define XXH_PREFETCH_DIST 384   /* byte offset past the start of the current stripe handed to XXH_PREFETCH() */
+
+/* XXH3_accumulate() : runs XXH3_accumulate_512() on `nbStripes` consecutive stripes of `input`;
+ * stripe k takes its key at `secret + k*XXH_SECRET_CONSUME_RATE`.
+ * assumption : nbStripes will not overflow secret size */
+XXH_FORCE_INLINE void
+XXH3_accumulate(     xxh_u64* XXH_RESTRICT acc,
+                     const xxh_u8* XXH_RESTRICT input,
+                     const xxh_u8* XXH_RESTRICT secret,
+                     size_t nbStripes,
+                     XXH3_accWidth_e accWidth)
+{
+    size_t stripe;
+    for (stripe = 0; stripe < nbStripes; stripe++) {
+        const xxh_u8* const stripeIn  = input  + stripe*STRIPE_LEN;
+        const xxh_u8* const stripeKey = secret + stripe*XXH_SECRET_CONSUME_RATE;
+        XXH_PREFETCH(stripeIn + XXH_PREFETCH_DIST);
+        XXH3_accumulate_512(acc, stripeIn, stripeKey, accWidth);
+    }
+}
+
+/* XXH3_hashLong_internal_loop() :
+ * Accumulates `len` bytes of `input` into `acc` : full blocks (each followed by a scramble), then the remaining
+ * whole stripes, then - when len is not a multiple of STRIPE_LEN - one more stripe made of the last STRIPE_LEN bytes.
+ * note : clang auto-vectorizes well in SS2 mode _if_ this function is `static`, and doesn't auto-vectorize it at all if it is `FORCE_INLINE`.
+ *        However, it auto-vectorizes better AVX2 if it is `FORCE_INLINE`. Pretty much every other modes and compilers prefer `FORCE_INLINE`. */
+
+#if defined(__clang__) && (XXH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__)
+static void
+#else
+XXH_FORCE_INLINE void
+#endif
+XXH3_hashLong_internal_loop( xxh_u64* XXH_RESTRICT acc,
+                             const xxh_u8* XXH_RESTRICT input, size_t len,
+                             const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                             XXH3_accWidth_e accWidth)
+{
+    size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;   /* stripes per block */
+    size_t const block_len = STRIPE_LEN * nb_rounds;   /* bytes per block */
+    size_t const nb_blocks = len / block_len;   /* full blocks contained in the input */
+
+    size_t n;
+
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+    for (n = 0; n < nb_blocks; n++) {
+        XXH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth);
+        XXH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN);   /* scrambler key = last STRIPE_LEN bytes of the secret */
+    }
+
+    /* last partial block : whole stripes left after the full blocks, no scramble afterwards */
+    XXH_ASSERT(len > STRIPE_LEN);
+    {   size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN;
+        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+        XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth);
+
+        /* last stripe : taken from the very end of input, so it overlaps bytes already accumulated above (test presumes STRIPE_LEN is a power of 2) */
+        if (len & (STRIPE_LEN - 1)) {
+            const xxh_u8* const p = input + len - STRIPE_LEN;
+#define XXH_SECRET_LASTACC_START 7  /* do not align on 8, so that secret is different from scrambler */
+            XXH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXH_SECRET_LASTACC_START, accWidth);
+        }   }
+}
+
+/* XXH3_mix2Accs() : XORs acc[0] and acc[1] each with 8 little-endian bytes of `secret`, and hands both to XXH3_mul128_fold64(). */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    xxh_u64 const keyedLo = acc[0] ^ XXH_readLE64(secret);
+    xxh_u64 const keyedHi = acc[1] ^ XXH_readLE64(secret + 8);
+    return XXH3_mul128_fold64(keyedLo, keyedHi);
+}
+
+/* XXH3_mergeAccs() : adds XXH3_mix2Accs() of the 4 accumulator pairs to `start`
+ * (pair k is keyed by secret bytes 16*k .. 16*k+15), then returns XXH3_avalanche() of the total. */
+static XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 total = start;
+    size_t pair;
+    for (pair = 0; pair < 4; pair++) {
+        total += XXH3_mix2Accs(acc + 2*pair, secret + 16*pair);
+    }
+    return XXH3_avalanche(total);
+}
+
+#define XXH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \
+                        PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }   /* initial values of the 8 accumulators; deliberately no trailing ';' : the declaration using the macro supplies it */
+
+XXH_FORCE_INLINE XXH64_hash_t   /* XXH3_hashLong_internal() : 64-bit hash of `input` through the stripe accumulators, keyed by `secret` (`secretSize` bytes) */
+XXH3_hashLong_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_64bits);
+
+    /* converge into final hash : merge the accumulators, starting from len * PRIME64_1 */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+#define XXH_SECRET_MERGEACCS_START 11  /* do not align on 8, so that secret is different from accumulator */
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);
+}
+
+
+XXH_NO_INLINE XXH64_hash_t    /* Long-input 64-bit hash keyed by the built-in kSecret. It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_64b_defaultSecret(const xxh_u8* XXH_RESTRICT input, size_t len)
+{
+    return XXH3_hashLong_internal(input, len, kSecret, sizeof(kSecret));
+}
+
+XXH_NO_INLINE XXH64_hash_t    /* Long-input 64-bit hash keyed by a caller-provided secret, passed through unchanged. It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_64b_withSecret(const xxh_u8* XXH_RESTRICT input, size_t len,
+                             const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
+{
+    return XXH3_hashLong_internal(input, len, secret, secretSize);
+}
+
+/* XXH_writeLE64() : stores `v64` at `dst` in little-endian byte order (byte-swapped first when the CPU is not little-endian); written with memcpy. */
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    xxh_u64 const le64 = XXH_CPU_LITTLE_ENDIAN ? v64 : XXH_swap64(v64);
+    memcpy(dst, &le64, sizeof(le64));
+}
+
+/* XXH3_initCustomSecret() :
+ * Derives a seeded secret from `kSecret`, 16 bytes at a time :
+ * `seed64` is added to the first 64-bit little-endian word and subtracted from the second.
+ * destination `customSecret` is presumed allocated and same size as `kSecret`. */
+XXH_FORCE_INLINE void XXH3_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64)
+{
+    size_t offset;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+    for (offset = 0; offset < XXH_SECRET_DEFAULT_SIZE; offset += 16) {
+        xxh_u64 const lo = XXH_readLE64(kSecret + offset)     + seed64;
+        xxh_u64 const hi = XXH_readLE64(kSecret + offset + 8) - seed64;
+        XXH_writeLE64(customSecret + offset,     lo);
+        XXH_writeLE64(customSecret + offset + 8, hi);
+    }
+}
+
+
+/* XXH3_hashLong_64b_withSeed() :
+ * Generate a custom key,
+ * based on alteration of default kSecret with the seed,
+ * and then use this key for long mode hashing.
+ * This operation is decently fast but nonetheless costs a little bit of time,
+ * so it is skipped when seed==0 : the default kSecret is then used as is.
+ */
+XXH_NO_INLINE XXH64_hash_t    /* It's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];   /* local storage for the seed-derived secret */
+    if (seed==0) return XXH3_hashLong_64b_defaultSecret(input, len);
+    XXH3_initCustomSecret(secret, seed);
+    return XXH3_hashLong_internal(input, len, secret, sizeof(secret));
+}
+
+/* XXH3_mix16B() : keys 16 bytes of `input` with 16 bytes of `secret` (seed64 added to the low key word, subtracted from the high one) and hands both words to XXH3_mul128_fold64(). */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+    xxh_u64 const key_lo   = XXH_readLE64(secret)     + seed64;
+    xxh_u64 const key_hi   = XXH_readLE64(secret + 8) - seed64;
+    xxh_u64 const keyed_lo = XXH_readLE64(input)     ^ key_lo;
+    xxh_u64 const keyed_hi = XXH_readLE64(input + 8) ^ key_hi;
+    return XXH3_mul128_fold64(keyed_lo, keyed_hi);
+}
+
+
+/* XXH3_len_17to128_64b() : 64-bit hash for 16 < len <= 128. Starting from len * PRIME64_1, adds XXH3_mix16B() of 16-byte chunks
+ * taken pairwise from the front and the back of `input` (1 pair, plus 1 more once len exceeds each of 32, 64 and 96), then avalanches. */
+XXH_FORCE_INLINE XXH64_hash_t XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+    {   const xxh_u8* const inputEnd = input + len;   /* one past the last input byte */
+        xxh_u64 sum = len * PRIME64_1;
+        sum += XXH3_mix16B(input,         secret,      seed);
+        sum += XXH3_mix16B(inputEnd - 16, secret + 16, seed);
+        if (len > 32) {
+            sum += XXH3_mix16B(input + 16,    secret + 32, seed);
+            sum += XXH3_mix16B(inputEnd - 32, secret + 48, seed);
+        }
+        if (len > 64) {
+            sum += XXH3_mix16B(input + 32,    secret + 64, seed);
+            sum += XXH3_mix16B(inputEnd - 48, secret + 80, seed);
+        }
+        if (len > 96) {
+            sum += XXH3_mix16B(input + 48,    secret + 96,  seed);
+            sum += XXH3_mix16B(inputEnd - 64, secret + 112, seed);
+        }
+        return XXH3_avalanche(sum);
+    }
+}
+
+#define XXH3_MIDSIZE_MAX 240
+
+/* XXH3_len_129to240_64b() : 64-bit hash for 128 < len <= XXH3_MIDSIZE_MAX : mixes every full 16-byte chunk (avalanching once after the first 8), then the last 16 bytes of input. */
+XXH_NO_INLINE XXH64_hash_t XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+#define XXH3_MIDSIZE_STARTOFFSET 3
+#define XXH3_MIDSIZE_LASTOFFSET  17
+
+    {   int const nbChunks = (int)len / 16;   /* number of full 16-byte chunks */
+        const xxh_u8* const lastKey = secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET;
+        xxh_u64 sum = len * PRIME64_1;
+        int chunk;
+        for (chunk = 0; chunk < 8; chunk++) {
+            sum += XXH3_mix16B(input + 16*chunk, secret + 16*chunk, seed);
+        }
+        sum = XXH3_avalanche(sum);
+        XXH_ASSERT(nbChunks >= 8);
+        for ( ; chunk < nbChunks; chunk++) {   /* chunks 8 and up : key restarts near the beginning of the secret, shifted by XXH3_MIDSIZE_STARTOFFSET */
+            sum += XXH3_mix16B(input + 16*chunk, secret + 16*(chunk-8) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        sum += XXH3_mix16B(input + len - 16, lastKey, seed);   /* last bytes */
+        return XXH3_avalanche(sum);
+    }
+}
+
+/* ===   Public entry point   === */
+
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)   /* 64-bit hash of `len` bytes at `input`, keyed by kSecret with seed 0; picks one routine per length class */
+{
+    if (len <= 16) return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0);
+    if (len <= 128) return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    return XXH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len);   /* len > XXH3_MIDSIZE_MAX */
+}
+
+XXH_PUBLIC_API XXH64_hash_t   /* same length dispatch as XXH3_64bits(), but keyed by the caller's `secret` (`secretSize` bytes) with seed 0 */
+XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /* if an action must be taken should `secret` conditions not be respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition (only checked by the XXH_ASSERT above).
+     * Adding a check and a branch here would cost performance at every hash */
+    if (len <= 16) return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+    if (len <= 128) return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    return XXH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH64_hash_t   /* same length dispatch as XXH3_64bits(), with `seed` : short inputs combine it with kSecret on the fly, long inputs go through XXH3_hashLong_64b_withSeed() */
+XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16) return XXH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed);
+    if (len <= 128) return XXH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    return XXH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed);
+}
+
+/* ===   XXH3 streaming   === */
+
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)   /* allocates one XXH3_state_t with XXH_malloc() and returns it as is : contents are not initialized here, and the result is whatever XXH_malloc() returned */
+{
+    return (XXH3_state_t*)XXH_malloc(sizeof(XXH3_state_t));
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)   /* releases `statePtr` with XXH_free(); always reports XXH_OK */
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)   /* copies *src_state into *dst_state */
+{
+    memcpy(dst_state, src_state, sizeof(*dst_state));
+    /* a seeded state has `secret` pointing into its own customSecret : re-point the copy at its own buffer, so it does not depend on src_state staying alive and unchanged */
+    if (src_state->secret == src_state->customSecret) dst_state->secret = dst_state->customSecret;
+}
+static void   /* XXH3_64bits_reset_internal() : zeroes the whole state, then sets the initial accumulators, the seed, the secret pointer and the limits derived from secretSize */
+XXH3_64bits_reset_internal(XXH3_state_t* statePtr,
+                           XXH64_hash_t seed,
+                           const xxh_u8* secret, size_t secretSize)
+{
+    XXH_ASSERT(statePtr != NULL);
+    memset(statePtr, 0, sizeof(*statePtr));   /* every field not assigned below starts at 0 */
+    statePtr->acc[0] = PRIME32_3;   /* acc[0..7] : same values, in the same order, as XXH3_INIT_ACC */
+    statePtr->acc[1] = PRIME64_1;
+    statePtr->acc[2] = PRIME64_2;
+    statePtr->acc[3] = PRIME64_3;
+    statePtr->acc[4] = PRIME64_4;
+    statePtr->acc[5] = PRIME32_2;
+    statePtr->acc[6] = PRIME64_5;
+    statePtr->acc[7] = PRIME32_1;
+    statePtr->seed = seed;
+    XXH_ASSERT(secret != NULL);
+    statePtr->secret = secret;   /* the secret is referenced, not copied */
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = (XXH32_hash_t)(secretSize - STRIPE_LEN);   /* offset of the last STRIPE_LEN bytes of the secret */
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+XXH_PUBLIC_API XXH_errorcode   /* resets `statePtr` for unseeded hashing with the built-in kSecret; XXH_ERROR if statePtr is NULL, XXH_OK otherwise */
+XXH3_64bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/* XXH3_64bits_reset_withSecret() : resets `statePtr` to hash with the caller's secret (referenced, not copied). All arguments are validated before the state is touched : returns XXH_ERROR, leaving the state as it was, if statePtr or secret is NULL or secretSize < XXH3_SECRET_SIZE_MIN. */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode   /* resets `statePtr` for seeded hashing : the secret becomes the state's own customSecret, derived from kSecret and `seed`; XXH_ERROR if statePtr is NULL */
+XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_64bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);   /* sizes the limits for a secret of XXH_SECRET_DEFAULT_SIZE bytes */
+    XXH3_initCustomSecret(statePtr->customSecret, seed);
+    statePtr->secret = statePtr->customSecret;   /* replaces the kSecret pointer set by the reset above */
+    return XXH_OK;
+}
+
+/* XXH3_consumeStripes() : accumulates `totalStripes` stripes of `input`, tracking the position inside the current block in *nbStripesSoFarPtr. When the block completes, `acc` is scrambled (key at secret + secretLimit) and accumulation resumes from the start of `secret`. */
+XXH_FORCE_INLINE void XXH3_consumeStripes( xxh_u64* acc,
+                     XXH32_hash_t* nbStripesSoFarPtr, XXH32_hash_t nbStripesPerBlock,
+                     const xxh_u8* input, size_t totalStripes,
+                     const xxh_u8* secret, size_t secretLimit, XXH3_accWidth_e accWidth)
+{
+    XXH32_hash_t const doneInBlock = *nbStripesSoFarPtr;
+    size_t const leftInBlock = nbStripesPerBlock - doneInBlock;
+    const xxh_u8* const curSecret = secret + doneInBlock * XXH_SECRET_CONSUME_RATE;
+    XXH_ASSERT(doneInBlock < nbStripesPerBlock);
+    if (totalStripes < leftInBlock) {   /* block not completed : no scrambling needed */
+        XXH3_accumulate(acc, input, curSecret, totalStripes, accWidth);
+        *nbStripesSoFarPtr = doneInBlock + (XXH32_hash_t)totalStripes;
+        return;
+    }
+    XXH3_accumulate(acc, input, curSecret, leftInBlock, accWidth);
+    XXH3_scrambleAcc(acc, secret + secretLimit);
+    XXH3_accumulate(acc, input + leftInBlock * STRIPE_LEN, secret, totalStripes - leftInBlock, accWidth);
+    *nbStripesSoFarPtr = (XXH32_hash_t)(totalStripes - leftInBlock);
+}
+
+XXH_FORCE_INLINE XXH_errorcode   /* XXH3_update() : streaming ingestion of `len` bytes. Input is buffered until the internal buffer overflows, then consumed in XXH3_INTERNALBUFFER_SIZE quantities; the remainder stays buffered. NULL input : XXH_OK only if XXH_ACCEPT_NULL_INPUT_POINTER >= 1, else XXH_ERROR. */
+XXH3_update(XXH3_state_t* state, const xxh_u8* input, size_t len, XXH3_accWidth_e accWidth)
+{
+    if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
+#endif
+
+    {   const xxh_u8* const bEnd = input + len;
+        state->totalLen += len;
+        if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {  /* everything fits : fill in tmp buffer, consume nothing yet */
+            XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+            state->bufferedSize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+        /* from here on : buffered bytes + input > XXH3_INTERNALBUFFER_SIZE */
+
+#define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / STRIPE_LEN)
+        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % STRIPE_LEN == 0);   /* clean multiple */
+
+        if (state->bufferedSize) {   /* some input within internal buffer: fill then consume it */
+            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+            input += loadSize;
+            XXH3_consumeStripes(state->acc,
+                                &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                state->secret, state->secretLimit,
+                                accWidth);
+            state->bufferedSize = 0;
+        }
+
+        /* consume input by full buffer quantities, directly from the caller's memory */
+        if (input+XXH3_INTERNALBUFFER_SIZE <= bEnd) {
+            const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
+            do {
+                XXH3_consumeStripes(state->acc,
+                                    &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                    input, XXH3_INTERNALBUFFER_STRIPES,
+                                    state->secret, state->secretLimit,
+                                    accWidth);
+                input += XXH3_INTERNALBUFFER_SIZE;
+            } while (input<=limit);
+            /* these bytes bypassed the buffer : keep the last STRIPE_LEN of them in its tail, where XXH3_digest_long() looks for the bytes preceding a short remainder */
+            XXH_memcpy(state->buffer + sizeof(state->buffer) - STRIPE_LEN, input - STRIPE_LEN, STRIPE_LEN);
+        }
+
+        if (input < bEnd) { /* some remaining input : buffer it */
+            XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+            state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+        }
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode   /* feeds `len` bytes of `input` into the streaming state, in 64-bit accumulator mode; returns what XXH3_update() returns */
+XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_64bits);
+}
+
+
+XXH_FORCE_INLINE void   /* XXH3_digest_long() : finishes accumulation of the buffered bytes into the caller's `acc`, which starts as a copy of state->acc; `state` itself is left untouched */
+XXH3_digest_long (XXH64_hash_t* acc, const XXH3_state_t* state, XXH3_accWidth_e accWidth)
+{
+    memcpy(acc, state->acc, sizeof(state->acc));  /* digest locally, state remains unaltered, and can continue ingesting more input afterwards */
+    if (state->bufferedSize >= STRIPE_LEN) {
+        size_t const totalNbStripes = state->bufferedSize / STRIPE_LEN;
+        XXH32_hash_t nbStripesSoFar = state->nbStripesSoFar;   /* local copy : the state's own stripe counter must not advance */
+        XXH3_consumeStripes(acc,
+                            &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, totalNbStripes,
+                            state->secret, state->secretLimit,
+                            accWidth);
+        if (state->bufferedSize % STRIPE_LEN) {  /* one last partial stripe : the final STRIPE_LEN buffered bytes, overlapping the stripe just consumed */
+            XXH3_accumulate_512(acc,
+                                state->buffer + state->bufferedSize - STRIPE_LEN,
+                                state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
+                                accWidth);
+        }
+    } else {  /* bufferedSize < STRIPE_LEN */
+        if (state->bufferedSize) { /* one last stripe : built from the tail of state->buffer followed by the buffered bytes. NOTE(review): relies on that tail still holding the input bytes that came just before the buffered ones - confirm XXH3_update() guarantees it */
+            xxh_u8 lastStripe[STRIPE_LEN];
+            size_t const catchupSize = STRIPE_LEN - state->bufferedSize;   /* bytes to borrow from the end of the buffer */
+            memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+            memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+            XXH3_accumulate_512(acc,
+                                lastStripe,
+                                state->secret + state->secretLimit - XXH_SECRET_LASTACC_START,
+                                accWidth);
+        }   }
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)   /* returns the 64-bit hash of everything ingested so far; `state` is only read */
+{
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];   /* working copy of the accumulators, filled by XXH3_digest_long() */
+        XXH3_digest_long(acc, state, XXH3_acc_64bits);
+        return XXH3_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1);
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code, hashing state->buffer in one shot (presumably the whole input is still buffered at this size - confirm against XXH3_update) */
+    if (state->seed)
+        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* ==========================================
+ * XXH3 128 bits (=> XXH128)
+ * ========================================== */
+
+/* XXH3_len_1to3_128b() : 128-bit hash of a 1..3 byte input : packs first/middle/last byte and len into 32 bits, keys that word (low half) and its byte-swap (high half) with secret and seed, then multiplies and avalanches each half. */
+XXH_FORCE_INLINE XXH128_hash_t XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    {   /* byte layout of `packed`, low to high : first, middle, last, len */
+        xxh_u32 const first  = input[0];
+        xxh_u32 const middle = input[len >> 1];
+        xxh_u32 const last   = input[len - 1];
+        xxh_u32 const packed = first | (middle << 8) | (last << 16) | ((xxh_u32)len << 24);
+        xxh_u64 const key_lo = XXH_readLE32(secret)   + seed;
+        xxh_u64 const key_hi = XXH_readLE32(secret+4) - seed;
+        XXH128_hash_t h128;
+        h128.low64  = XXH3_avalanche(((xxh_u64)packed ^ key_lo) * PRIME64_1);
+        h128.high64 = XXH3_avalanche(((xxh_u64)XXH_swap32(packed) ^ key_hi) * PRIME64_5);
+        return h128;
+    }
+}
+
+/* XXH3_len_4to8_128b() : 128-bit hash of a 4..8 byte input. The first 4 and last 4 bytes (overlapping when len < 8) form one 64-bit word; it keys the low half, its byte-swap keys the high half, and each half goes through two xorshift-multiply rounds and an avalanche. */
+XXH_FORCE_INLINE XXH128_hash_t XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    {   xxh_u64 const head = XXH_readLE32(input);
+        xxh_u64 const tail = XXH_readLE32(input + len - 4);
+        xxh_u64 const combined = head | (tail << 32);
+        xxh_u64 lo = combined ^ (XXH_readLE64(secret) + seed);
+        xxh_u64 hi = XXH_swap64(combined) ^ (XXH_readLE64(secret + 8) - seed);
+        XXH128_hash_t h128;
+        lo = len + ((lo ^ (lo >> 51)) * PRIME32_1);
+        lo = (lo ^ (lo >> 47)) * PRIME64_2;
+        hi = ((hi ^ (hi >> 47)) * PRIME64_1) - len;
+        hi = (hi ^ (hi >> 43)) * PRIME64_4;
+        h128.low64  = XXH3_avalanche(lo);
+        h128.high64 = XXH3_avalanche(hi);
+        return h128;
+    }
+}
+
+/* XXH3_len_9to16_128b() : 128-bit hash of a 9..16 byte input : keys the first and last 8 bytes (overlapping when len < 16), runs two 64x64->128 multiply rounds folding in len and the keyed high word, then avalanches both halves. */
+XXH_FORCE_INLINE XXH128_hash_t XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const key_lo   = XXH_readLE64(secret)     + seed;
+        xxh_u64 const key_hi   = XXH_readLE64(secret + 8) - seed;
+        xxh_u64 const keyed_lo = XXH_readLE64(input)           ^ key_lo;
+        xxh_u64 const keyed_hi = XXH_readLE64(input + len - 8) ^ key_hi;
+        XXH128_hash_t const prod = XXH_mult64to128(keyed_lo ^ keyed_hi, PRIME64_1);
+        xxh_u64 const mid_hi = prod.high64 + keyed_hi * PRIME64_1;
+        xxh_u64 const mid_lo = (prod.low64 + XXH_mult32to64(len, PRIME32_5)) ^ (mid_hi >> 32);
+        XXH128_hash_t h128 = XXH_mult64to128(mid_lo, PRIME64_2);
+        h128.high64 += mid_hi * PRIME64_2;
+        h128.low64   = XXH3_avalanche(h128.low64);
+        h128.high64  = XXH3_avalanche(h128.high64);
+        return h128;
+    }
+}
+
+/* Dispatches inputs of at most 16 bytes to the routine matching their length class.
+ * Assumption : `secret` size is >= 16 (it should be >= XXH3_SECRET_SIZE_MIN anyway) */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+    if (len > 3) return XXH3_len_4to8_128b(input, len, secret, seed);
+    if (len > 0) return XXH3_len_1to3_128b(input, len, secret, seed);
+    {   XXH128_hash_t const emptyHash = { 0, 0 };   /* len == 0 : result is all zeroes, whatever the secret or seed */
+        return emptyHash;
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t   /* shared long-input core : run the accumulation loop, then merge the accumulators into low64 and high64 */
+XXH3_hashLong_128b_internal(const xxh_u8* XXH_RESTRICT input, size_t len,
+                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXH3_acc_128bits);
+
+    /* converge into final hash : the same accumulators are merged twice, with different secret windows and different length-derived start values */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);   /* both secret windows used below must fit inside the secret */
+    {   xxh_u64 const low64 = XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1);   /* window counted from the start of the secret */
+        xxh_u64 const high64 = XXH3_mergeAccs(acc, secret + secretSize - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)len * PRIME64_2));   /* window counted back from the end of the secret */
+        XXH128_hash_t const h128 = { low64, high64 };
+        return h128;
+    }
+}
+
+XXH_NO_INLINE XXH128_hash_t    /* Long-input path using the built-in kSecret. Deliberately out-of-line : it's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_128b_defaultSecret(const xxh_u8* input, size_t len)
+{
+    return XXH3_hashLong_128b_internal(input, len, kSecret, sizeof(kSecret));
+}
+
+XXH_NO_INLINE XXH128_hash_t    /* Long-input path using a caller-provided secret, passed through unchanged. Deliberately out-of-line : it's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_128b_withSecret(const xxh_u8* input, size_t len,
+                              const xxh_u8* secret, size_t secretSize)
+{
+    return XXH3_hashLong_128b_internal(input, len, secret, secretSize);
+}
+
+XXH_NO_INLINE XXH128_hash_t    /* Long-input path for a seeded hash. Deliberately out-of-line : it's important for performance that XXH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */
+XXH3_hashLong_128b_withSeed(const xxh_u8* input, size_t len, XXH64_hash_t seed)
+{
+    XXH_ALIGN(8) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];   /* stack buffer receiving the secret generated from `seed` */
+    if (seed == 0) return XXH3_hashLong_128b_defaultSecret(input, len);   /* seed 0 : hash with kSecret directly, the buffer above stays unused */
+    XXH3_initCustomSecret(secret, seed);
+    return XXH3_hashLong_128b_internal(input, len, secret, sizeof(secret));
+}
+
+
+XXH_FORCE_INLINE XXH128_hash_t   /* folds two 16-byte input pieces and 32 secret bytes into the accumulator; returns the updated copy */
+XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
+    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);   /* cross-feed : low64 also takes the sum of input_2's two 64-bit words */
+    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
+    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);   /* ... and high64 the sum of input_1's two 64-bit words */
+    return acc;
+}
+
+XXH_NO_INLINE XXH128_hash_t   /* 128-bit hash of a mid-size input : 128 < len <= XXH3_MIDSIZE_MAX */
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;   /* secretSize is only consulted by the assert */
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        int const nbRounds = (int)len / 32;   /* number of complete 32-byte chunks */
+        int i;
+        acc.low64 = len * PRIME64_1;
+        acc.high64 = 0;
+        for (i=0; i<4; i++) {   /* first 4 chunks (128 bytes), paired with the first 128 secret bytes */
+            acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+(32*i), seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);
+        acc.high64 = XXH3_avalanche(acc.high64);
+        XXH_ASSERT(nbRounds >= 4);
+        for (i=4 ; i < nbRounds; i++) {   /* remaining complete chunks : secret offsets restart at XXH3_MIDSIZE_STARTOFFSET */
+            acc = XXH128_mix32B(acc, input+(32*i), input+(32*i)+16, secret+XXH3_MIDSIZE_STARTOFFSET+(32*(i-4)), seed);
+        }
+        /* last bytes : the final 32 input bytes (may overlap chunks already mixed), pieces passed in swapped order, seed negated */
+        acc = XXH128_mix32B(acc, input + len - 16, input + len - 32, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, 0ULL - seed);
+
+        {   xxh_u64 const low64 = acc.low64 + acc.high64;
+            xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
+            XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) };   /* same finalization as XXH3_len_17to128_128b */
+            return h128;
+        }
+    }
+}
+
+
+XXH_FORCE_INLINE XXH128_hash_t   /* 128-bit hash of an input of 17 to 128 bytes */
+XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;   /* secretSize is only consulted by the assert */
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   XXH128_hash_t acc;
+        acc.low64 = len * PRIME64_1;
+        acc.high64 = 0;
+        if (len > 32) {   /* each mix pairs 16 bytes counted from the start with 16 bytes counted from the end; longer inputs add pairs nearer the middle */
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);   /* always executed : first and last 16 bytes */
+        {   xxh_u64 const low64 = acc.low64 + acc.high64;
+            xxh_u64 const high64 = (acc.low64 * PRIME64_1) + (acc.high64 * PRIME64_4) + ((len - seed) * PRIME64_2);
+            XXH128_hash_t const h128 = { XXH3_avalanche(low64), (XXH64_hash_t)0 - XXH3_avalanche(high64) };   /* high half is the negated avalanche */
+            return h128;
+        }
+    }
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)   /* one-shot 128-bit hash with the default secret (kSecret) and seed 0 */
+{
+    if (len <= 16) return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, 0);   /* shortest inputs are tested first */
+    if (len <= 128) return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0);
+    return XXH3_hashLong_128b_defaultSecret((const xxh_u8*)input, len);   /* len > XXH3_MIDSIZE_MAX */
+}
+
+XXH_PUBLIC_API XXH128_hash_t   /* one-shot 128-bit hash keyed by a caller-provided secret; the seed is fixed to 0 */
+XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    /* if an action must be taken should `secret` conditions not be respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition (only checked by the XXH_ASSERT above).
+     * Adding a check and a branch here would cost performance at every hash */
+    if (len <= 16) return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0);
+    if (len <= 128) return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0);
+    return XXH3_hashLong_128b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize);   /* len > XXH3_MIDSIZE_MAX */
+}
+
+XXH_PUBLIC_API XXH128_hash_t   /* one-shot 128-bit hash with the default secret and a caller-provided 64-bit seed */
+XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+{
+    if (len <= 16) return XXH3_len_0to16_128b((const xxh_u8*)input, len, kSecret, seed);   /* short and mid-size paths hand kSecret and the seed to the mixers */
+    if (len <= 128) return XXH3_len_17to128_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    if (len <= XXH3_MIDSIZE_MAX) return XXH3_len_129to240_128b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed);
+    return XXH3_hashLong_128b_withSeed((const xxh_u8*)input, len, seed);   /* long path builds a secret from the seed instead */
+}
+
+XXH_PUBLIC_API XXH128_hash_t   /* convenience alias : same arguments and result as XXH3_128bits_withSeed() */
+XXH128(const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+
+/* all the functions are actually the same as for 64-bit streaming variant :
+   the reset one simply forwards to XXH3_64bits_reset_internal(),
+   update/digest pass XXH3_acc_128bits, and only the end of the digest function differs */
+
+static void   /* 128-bit streaming reset : forwards unchanged to the 64-bit reset helper */
+XXH3_128bits_reset_internal(XXH3_state_t* statePtr,
+                            XXH64_hash_t seed,
+                            const xxh_u8* secret, size_t secretSize)
+{
+    XXH3_64bits_reset_internal(statePtr, seed, secret, secretSize);
+}
+
+XXH_PUBLIC_API XXH_errorcode   /* start a new 128-bit streaming hash with the default secret and seed 0; XXH_ERROR only if statePtr is NULL */
+XXH3_128bits_reset(XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, 0, kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode   /* start a new 128-bit streaming hash keyed by a caller-provided secret (seed 0) */
+XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;   /* arguments are validated BEFORE the state is touched : a rejected call leaves *statePtr unchanged */
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;   /* an undersized secret must never reach the reset helper */
+    XXH3_128bits_reset_internal(statePtr, 0, (const xxh_u8*)secret, secretSize);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode   /* start a new 128-bit streaming hash with a 64-bit seed; XXH_ERROR only if statePtr is NULL */
+XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_128bits_reset_internal(statePtr, seed, kSecret, XXH_SECRET_DEFAULT_SIZE);   /* reset with the default secret and its size first */
+    XXH3_initCustomSecret(statePtr->customSecret, seed);   /* then fill the state's own secret buffer from the seed */
+    statePtr->secret = statePtr->customSecret;   /* and make the state use that buffer instead of kSecret */
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode   /* feed `len` more input bytes into the streaming state; returns whatever XXH3_update() returns */
+XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len, XXH3_acc_128bits);
+}
+
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)   /* produce the 128-bit hash of everything fed so far; `state` is const and not modified */
+{
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {   /* long input : finish from the streamed accumulators */
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[ACC_NB];
+        XXH3_digest_long(acc, state, XXH3_acc_128bits);
+        XXH_ASSERT(state->secretLimit + STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);   /* secretLimit + STRIPE_LEN is passed as the secret size on the last line of this function */
+        {   xxh_u64 const low64 = XXH3_mergeAccs(acc, state->secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)state->totalLen * PRIME64_1);
+            xxh_u64 const high64 = XXH3_mergeAccs(acc, state->secret + state->secretLimit + STRIPE_LEN - sizeof(acc) - XXH_SECRET_MERGEACCS_START, ~((xxh_u64)state->totalLen * PRIME64_2));   /* same merge layout as XXH3_hashLong_128b_internal */
+            XXH128_hash_t const h128 = { low64, high64 };
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code, re-hash state->buffer with the one-shot functions (assumes the buffer still holds the whole input - TODO confirm against XXH3_update) */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), state->secret, state->secretLimit + STRIPE_LEN);
+}
+
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp */
+
+/* return : 1 if both hashes are equal, 0 if they differ */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* a byte-wise comparison is sufficient : XXH128_hash_t is compact, it has no padding byte */
+    return memcmp(&h1, &h2, sizeof(h1)) == 0;
+}
+
+/* Three-way comparison of two XXH128_hash_t, usable directly as a stdlib qsort() callback.
+ * return : >0 if *h128_1  > *h128_2
+ *          <0 if *h128_1  < *h128_2
+ *          =0 if *h128_1 == *h128_2  */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
+{
+    const XXH128_hash_t* const lhs = (const XXH128_hash_t*)h128_1;
+    const XXH128_hash_t* const rhs = (const XXH128_hash_t*)h128_2;
+    /* high64 decides first; hash values differ there in most cases */
+    if (lhs->high64 != rhs->high64)
+        return (lhs->high64 > rhs->high64) ? 1 : -1;
+    return (lhs->low64 > rhs->low64) - (rhs->low64 > lhs->low64);
+}
+
+
+/*======   Canonical representation   ======*/
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
+{
+    XXH64_hash_t const beHigh = XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(hash.high64) : hash.high64;
+    XXH64_hash_t const beLow  = XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(hash.low64)  : hash.low64;
+    char* const out = (char*)dst;
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    /* canonical form : both halves byte-swapped on little-endian hosts, high64 stored first, low64 right after */
+    memcpy(out, &beHigh, sizeof(beHigh));
+    memcpy(out + sizeof(beHigh), &beLow, sizeof(beLow));
+}
+
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+{
+    XXH128_hash_t decoded;   /* inverse of XXH128_canonicalFromHash() : high64 comes first in the canonical bytes */
+    decoded.low64  = XXH_readBE64(src->digest + 8);
+    decoded.high64 = XXH_readBE64(src->digest);
+    return decoded;
+}
+
+
+
+#endif  /* XXH3_H */
diff --git a/lib/xxhash/xxhash.cpp b/lib/xxhash/xxhash.cpp
new file mode 100644
index 0000000..1e4df67
--- /dev/null
+++ b/lib/xxhash/xxhash.cpp
@@ -0,0 +1,5 @@
+//
+// Created by Martin Steinegger on 2020-01-14.
+//
+
+#include "xxhash.h"
diff --git a/lib/xxhash/xxhash.h b/lib/xxhash/xxhash.h
new file mode 100644
index 0000000..3593b91
--- /dev/null
+++ b/lib/xxhash/xxhash.h
@@ -0,0 +1,1671 @@
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MurmurHash 3a   2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+Note : SMHasher's CRC32 implementation is not the fastest one.
+Other speed-oriented implementations can be faster,
+especially in combination with PCLMUL instruction :
+http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
+*/
+
+#define XXH_STATIC_LINKING_ONLY   /* access advanced declarations */
+#define XXH_IMPLEMENTATION   /* access definitions */
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+/* ****************************
+ *  API modifier
+ ******************************/
+/** XXH_INLINE_ALL (and XXH_PRIVATE_API)
+ *  This build macro includes xxhash functions in `static` mode
+ *  in order to inline them, and remove their symbol from the public list.
+ *  Inlining offers great performance improvement on small keys,
+ *  and dramatic ones when length is expressed as a compile-time constant.
+ *  See https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html .
+ *  Methodology :
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * `xxhash.c` is automatically included.
+ *  It's not useful to compile and link it as a separate object.
+ */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)   /* inline mode : every public function becomes a static (inline) function */
+#  ifndef XXH_STATIC_LINKING_ONLY
+#    define XXH_STATIC_LINKING_ONLY
+#  endif
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+     /* this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
+#  endif
+#else   /* regular mode : external linkage, with optional DLL decoration under MSVC */
+#  if (defined(WIN32) || defined(_WIN32)) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))   /* _WIN32 is the macro the compiler itself defines */
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif defined(XXH_IMPORT)   /* test definedness : a bare `#elif XXH_IMPORT` breaks when XXH_IMPORT is defined empty or as 0 */
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/*! XXH_NAMESPACE, aka Namespace Emulation :
+ *
+ * If you want to include _and expose_ xxHash functions from within your own library,
+ * but also want to avoid symbol collisions with other libraries which may also include xxHash,
+ *
+ * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+ * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values).
+ *
+ * Note that no change is required within the calling program as long as it includes `xxhash.h` :
+ * regular symbol name will be automatically translated by this header.
+ */
+#ifdef XXH_NAMESPACE   /* rename every public symbol below by prefixing it with the value of XXH_NAMESPACE */
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)   /* extra level so that XXH_NAMESPACE is macro-expanded before being pasted */
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)   /* 32-bit API */
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)   /* 64-bit API */
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    7
+#define XXH_VERSION_RELEASE  2
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>   /* C++ or C99 (VMS excluded) : take the exact-width type from <stdint.h> */
+typedef uint32_t XXH32_hash_t;
+#else
+#   include <limits.h>   /* otherwise : pick a native type whose maximum is exactly 2^32-1 */
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   else
+#     if ULONG_MAX == 0xFFFFFFFFUL
+        typedef unsigned long XXH32_hash_t;
+#     else
+#       error "unsupported platform : need a 32-bit type"
+#     endif
+#   endif
+#endif
+
+/*! XXH32() :
+    Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/*******   Streaming   *******/
+
+/*
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * XXH state must first be allocated, using XXH*_createState() .
+ *
+ * Start a new hash by initializing state with a seed, using XXH*_reset().
+ *
+ * Then, feed the hash state by calling XXH*_update() as many times as necessary.
+ * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using XXH*_digest().
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a digest,
+ * and generate some new hash values later on, by invoking again XXH*_digest().
+ *
+ * When done, release the state, using XXH*_freeState().
+ */
+
+typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+
+/*******   Canonical representation   *******/
+
+/* Default return values from XXH functions are basic unsigned 32 and 64 bits.
+ * This is the simplest and fastest format for further post-processing.
+ * However, this leaves open the question of what is the order of bytes,
+ * since little and big endian conventions will write the same number differently.
+ *
+ * The canonical representation settles this issue,
+ * by mandating big-endian convention,
+ * aka, the same convention as human-readable numbers (large digits first).
+ * When writing hash values to storage, sending them over a network, or printing them,
+ * it's highly recommended to use the canonical representation,
+ * to ensure portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values into and from canonical format.
+ */
+
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>   /* C++ or C99 (VMS excluded) : take the exact-width type from <stdint.h> */
+typedef uint64_t XXH64_hash_t;
+#else
+/* the following type must have a width of 64-bit (not verified at compile time here, unlike XXH32_hash_t) */
+    typedef unsigned long long XXH64_hash_t;
+#endif
+
+/*! XXH64() :
+ *  Returns the 64-bit hash of sequence of length @length stored at memory address @input.
+ *  @seed can be used to alter the result predictably.
+ *  This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark).
+ */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, XXH64_hash_t seed);
+
+/*******   Streaming   *******/
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+#define XXHASH_H_STATIC_13879238742
+/* ************************************************************************************************
+   This section contains declarations which are not guaranteed to remain stable.
+   They may change in future versions, becoming incompatible with a different version of the library.
+   These declarations should only be used with static linking.
+   Never use them in association with dynamic linking !
+*************************************************************************************************** */
+
+/* These definitions are only present to allow
+ * static allocation of XXH state, on stack or in a struct for example.
+ * Never **ever** use members directly. */
+
+struct XXH32_state_s {   /* XXH32 streaming state. NOTE(review): the field roles below are inferred from names only - confirm against XXH32_update()/XXH32_digest() */
+   XXH32_hash_t total_len_32;   /* presumably the input length so far, kept in 32 bits */
+   XXH32_hash_t large_len;
+   XXH32_hash_t v1;   /* v1..v4 : presumably the four running accumulators */
+   XXH32_hash_t v2;
+   XXH32_hash_t v3;
+   XXH32_hash_t v4;
+   XXH32_hash_t mem32[4];   /* presumably a 16-byte buffer for input not yet consumed */
+   XXH32_hash_t memsize;   /* presumably the number of bytes currently held in mem32 */
+   XXH32_hash_t reserved;   /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH32_state_t */
+
+
+#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+
+struct XXH64_state_s {   /* XXH64 streaming state. NOTE(review): the field roles below are inferred from names only - confirm against XXH64_update()/XXH64_digest() */
+   XXH64_hash_t total_len;   /* presumably the input length so far */
+   XXH64_hash_t v1;   /* v1..v4 : presumably the four running accumulators */
+   XXH64_hash_t v2;
+   XXH64_hash_t v3;
+   XXH64_hash_t v4;
+   XXH64_hash_t mem64[4];   /* presumably a 32-byte buffer for input not yet consumed */
+   XXH32_hash_t memsize;   /* presumably the number of bytes currently held in mem64 */
+   XXH32_hash_t reserved32;  /* required for padding anyway */
+   XXH64_hash_t reserved64;  /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH64_state_t */
+
+
+/*-**********************************************************************
+*  XXH3
+*  New experimental hash
+************************************************************************/
+
+/* *********************************************
+ * XXH3 is a new hash algorithm,
+ * featuring improved speed performance for both small and large inputs.
+ * See full speed analysis at : http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ * In general, expect XXH3 to run about ~2x faster on large inputs,
+ * and >3x faster on small ones, though exact differences depend on platform.
+ *
+ * The algorithm is portable, will generate the same hash on all platforms.
+ * It benefits greatly from vectorization units, but does not require it.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ * When only 64 bits are needed, prefer calling the _64bits variant :
+ * it reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The XXH3 algorithm is still considered experimental.
+ * Produced results can still change between versions.
+ * Results produced by v0.7.x are not comparable with results from v0.7.y .
+ * It's nonetheless possible to use XXH3 for ephemeral data (local sessions),
+ * but avoid storing values in long-term storage for later reads.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ *
+ * There are still a number of opened questions that community can influence during the experimental period.
+ * I'm trying to list a few of them below, though don't consider this list as complete.
+ *
+ * - 128-bits output type : currently defined as a structure of two 64-bits fields.
+ *                          That's because 128-bit values do not exist in C standard.
+ *                          Note that it means that, at byte level, result is not identical depending on endianness.
+ *                          However, at field level, they are identical on all platforms.
+ *                          The canonical representation solves the issue of identical byte-level representation across platforms,
+ *                          which is necessary for serialization.
+ *                          Q1 : Would there be a better representation for a 128-bit hash result ?
+ *                          Q2 : Are the names of the inner 64-bit fields important ? Should they be changed ?
+ *
+ * - Prototype XXH128() :   XXH128() uses the same arguments as XXH64(), for consistency.
+ *                          It means it maps to XXH3_128bits_withSeed().
+ *                          This variant is slightly slower than XXH3_128bits(),
+ *                          because the seed is now part of the algorithm, and can't be simplified.
+ *                          Is that a good idea ?
+ *
+ * - Seed type for XXH128() : currently, it's a single 64-bit value, like the 64-bit variant.
+ *                          It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash.
+ *                          But 128-bit seed is more difficult to use, since it requires to pass a structure instead of a scalar value.
+ *                          Such a variant could either replace current one, or become an additional one.
+ *                          Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`).
+ *                          Follow up question : if both 64-bit and 128-bit seeds are allowed, which variant should be called XXH128 ?
+ *
+ * - Result for len==0 :    Currently, the result of hashing a zero-length input is always `0`.
+ *                          It seems okay as a return value when using "default" secret and seed.
+ *                          But is it still fine to return `0` when secret or seed are non-default ?
+ *                          Are there use cases which could depend on generating a different hash result for zero-length input when the secret is different ?
+ *
+ * - Consistency (1) :      Streaming XXH128 uses an XXH3 state, which is the same state as XXH3_64bits().
+ *                          It means a 128bit streaming loop must invoke the following symbols :
+ *                          XXH3_createState(), XXH3_128bits_reset(), XXH3_128bits_update() (loop), XXH3_128bits_digest(), XXH3_freeState().
+ *                          Is that consistent enough ?
+ *
+ * - Consistency (2) :      The canonical representation of `XXH3_64bits` is provided by existing functions
+ *                          XXH64_canonicalFromHash(), and reverse operation XXH64_hashFromCanonical().
+ *                          As a mirror, canonical functions for XXH128_hash_t results generated by `XXH3_128bits`
+ *                          are XXH128_canonicalFromHash() and XXH128_hashFromCanonical().
+ *                          Which means, `XXH3` doesn't appear in the names, because canonical functions operate on a type,
+ *                          independently of which algorithm was used to generate that type.
+ *                          Is that consistent enough ?
+ */
+
+#ifdef XXH_NAMESPACE
+#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+
+#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+
+#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#endif
+
+/* XXH3_64bits() :
+ * default 64-bit variant, using default secret and default seed of 0.
+ * It's the fastest variant. */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
+
+/* XXH3_64bits_withSecret() :
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The secret *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
+ * It should consist of random bytes.
+ * Avoid repeating same character, or sequences of bytes,
+ * and especially avoid swathes of \0.
+ * Failure to respect these conditions will result in a poor quality hash.
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+/* XXH3_64bits_withSeed() :
+ * This variant generates on the fly a custom secret,
+ * based on the default secret, altered using the `seed` value.
+ * While this operation is decently fast, note that it's not completely free.
+ * note : seed==0 produces same results as XXH3_64bits() */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* streaming 64-bit */
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+typedef struct XXH3_state_s XXH3_state_t;
+
+#define XXH3_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+#define XXH3_INTERNALBUFFER_SIZE 256
+struct XXH3_state_s {   /* streaming state shared by the XXH3 64-bit and 128-bit APIs (both take an XXH3_state_t*) */
+   XXH_ALIGN(64) XXH64_hash_t acc[8];   /* eight 64-bit accumulators; 64-byte alignment is requested, but XXH_ALIGN expands to nothing on unknown compilers */
+   XXH_ALIGN(64) unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE];  /* used to store a custom secret generated from the seed. Makes state larger. Design might change */
+   XXH_ALIGN(64) unsigned char buffer[XXH3_INTERNALBUFFER_SIZE];   /* NOTE(review): presumably holds input not yet consumed - the update code is not in this section, confirm */
+   XXH32_hash_t bufferedSize;   /* NOTE(review): presumably the number of valid bytes in `buffer` - confirm */
+   XXH32_hash_t nbStripesPerBlock;
+   XXH32_hash_t nbStripesSoFar;
+   XXH32_hash_t secretLimit;
+   XXH32_hash_t reserved32;
+   XXH32_hash_t reserved32_2;
+   XXH64_hash_t totalLen;
+   XXH64_hash_t seed;
+   XXH64_hash_t reserved64;
+   const unsigned char* secret;    /* note : there is some padding after, due to alignment on 64 bytes */
+};   /* typedef'd to XXH3_state_t */
+
+/* Streaming requires state maintenance.
+ * This operation costs memory and cpu.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer using one-shot functions whenever possible. */
+
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+
+
+/* XXH3_64bits_reset() :
+ * initialize with default parameters.
+ * result will be equivalent to `XXH3_64bits()`. */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+/* XXH3_64bits_reset_withSeed() :
+ * generate a custom secret from `seed`, and store it into state.
+ * digest will be equivalent to `XXH3_64bits_withSeed()`. */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+/* XXH3_64bits_reset_withSecret() :
+ * `secret` is referenced, and must outlive the hash streaming session.
+ * secretSize must be >= XXH3_SECRET_SIZE_MIN.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);
+
+
+/* 128-bit */
+
+#ifdef XXH_NAMESPACE
+#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+
+#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+
+#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
+
+typedef struct {   /* 128-bit hash value, passed and returned by value as two 64-bit members */
+    XXH64_hash_t low64;
+    XXH64_hash_t high64;
+} XXH128_hash_t;
+
+XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);  /* == XXH128() */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
+
+
+/* Note : for better performance, following functions can be inlined,
+ * using XXH_INLINE_ALL */
+
+/* return : 1 if equal, 0 if different */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/* This comparator is compatible with stdlib's qsort().
+ * return : >0 if *h128_1  > *h128_2
+ *          <0 if *h128_1  < *h128_2
+ *          =0 if *h128_1 == *h128_2  */
+XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[16]; } XXH128_canonical_t;   /* 16 raw bytes (128 bits) : the byte-array form converted to/from XXH128_hash_t */
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+
+/*-**********************************************************************
+*  xxHash implementation
+*  Functions implementation used to be hosted within xxhash.c .
+*  However, code inlining requires to place implementation in the header file.
+*  As a consequence, xxhash.c used to be included within xxhash.h .
+*  But some build systems don't like *.c inclusions.
+*  So the implementation is now directly integrated within xxhash.h .
+*  Another small advantage is that xxhash.c is no longer required in /includes .
+************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on compiler but violates the C standard.
+ *            It can generate buggy code on targets which do not support unaligned memory accesses.
+ *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+#  if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6)
+#    define XXH_FORCE_MEMORY_ACCESS 2
+#  elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+  (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+/*!XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault.
+ * When this macro is enabled, xxHash actively checks input for null pointer.
+ * If it is, the result for a null input pointer is the same as for a null-length input.
+ */
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned,
+ * or when alignment doesn't matter for performance.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+/*!XXH_REROLL:
+ * Whether to reroll XXH32_finalize, and XXH64_finalize,
+ * instead of using an unrolled jump table/if statement loop.
+ *
+ * This is automatically defined on -Os/-Oz on GCC and Clang. */
+#ifndef XXH_REROLL
+#  if defined(__OPTIMIZE_SIZE__)
+#    define XXH_REROLL 1
+#  else
+#    define XXH_REROLL 0
+#  endif
+#endif
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+/*! Modify the local functions below should you wish to use some other memory routines
+*   for malloc(), free() */
+#include <stdlib.h>
+static void* XXH_malloc(size_t s) { return malloc(s); }   /* allocation hook : forwards to malloc(), returns its result unchanged */
+static void  XXH_free  (void* p)  { free(p); }   /* release hook : forwards to free() */
+/*! and for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }   /* copy hook : forwards to memcpy(), returns dest */
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)      /* disable: C4127: conditional expression is constant */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* C99 */
+#    ifdef __GNUC__
+#      define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXH_NO_INLINE static __attribute__((noinline))
+#    else
+#      define XXH_FORCE_INLINE static inline
+#      define XXH_NO_INLINE static
+#    endif
+#  else
+#    define XXH_FORCE_INLINE static
+#    define XXH_NO_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+
+/* *************************************
+*  Debug
+***************************************/
+/* DEBUGLEVEL is expected to be defined externally,
+ * typically through compiler command line.
+ * Value must be a number. */
+#ifndef DEBUGLEVEL
+#  define DEBUGLEVEL 0
+#endif
+
+#if (DEBUGLEVEL>=1)
+#  include <assert.h>   /* note : can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  define XXH_ASSERT(c)   ((void)0)
+#endif
+
+/* note : use after variable declarations */
+#define XXH_STATIC_ASSERT(c)  { enum { XXH_sa = 1/(int)(!!(c)) }; }
+
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef uint8_t  xxh_u8;
+#else
+  typedef unsigned char      xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+
+/* ***   Memory access   *** */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Method 2 : force direct memory access through a cast pointer. Only works on CPUs which support unaligned memory access in hardware */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* Method 1 : read through a packed union. __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+static xxh_u32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/* Method 0 (default) : copy the 4 bytes out with memcpy(). Portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;   /* native byte order; callers apply XXH_swap32() when needed */
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;   /* byte-order tag : 0 = big-endian, 1 = little-endian */
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+#  if defined(_WIN32) /* Windows is always little endian */ \
+     || defined(__LITTLE_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 0
+#  else
+static int XXH_isLittleEndian(void)   /* run-time byte-order probe; only compiled when none of the macros tested above settled the question */
+{
+    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
+    return one.c[0];   /* first byte of the 32-bit value 1 : non-zero only when the low-order byte is stored first */
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifndef __has_builtin
+#  define __has_builtin(x) 0
+#endif
+
+#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) && __has_builtin(__builtin_rotateleft64)
+#  define XXH_rotl32 __builtin_rotateleft32
+#  define XXH_rotl64 __builtin_rotateleft64
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXH_swap32 (xxh_u32 x)   /* portable fallback (no MSVC / GCC>=4.3 builtin) : reverses the byte order of a 32-bit value */
+{
+    return  ((x << 24) & 0xff000000 ) |   /* lowest byte  -> highest byte */
+            ((x <<  8) & 0x00ff0000 ) |   /* second byte  -> third byte   */
+            ((x >>  8) & 0x0000ff00 ) |   /* third byte   -> second byte  */
+            ((x >> 24) & 0x000000ff );    /* highest byte -> lowest byte  */
+}
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)   /* reads 4 bytes at ptr via XXH_read32() and returns them interpreted as little-endian */
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));   /* byte-swap only when the host is not little-endian */
+}
+
+static xxh_u32 XXH_readBE32(const void* ptr)   /* reads 4 bytes at ptr via XXH_read32() and returns them interpreted as big-endian */
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);   /* byte-swap only when the host is little-endian */
+}
+
+XXH_FORCE_INLINE xxh_u32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)   /* little-endian 32-bit read; `align` selects which access path is used */
+{
+    if (align==XXH_unaligned) {
+        return XXH_readLE32(ptr);   /* goes through XXH_read32(), whose method is chosen by XXH_FORCE_MEMORY_ACCESS */
+    } else {
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);   /* direct dereference : the caller vouches for the alignment of ptr */
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }   /* reports the XXH_VERSION_NUMBER this code was compiled with */
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+static const xxh_u32 PRIME32_1 = 0x9E3779B1U;   /* 0b10011110001101110111100110110001 */
+static const xxh_u32 PRIME32_2 = 0x85EBCA77U;   /* 0b10000101111010111100101001110111 */
+static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU;   /* 0b11000010101100101010111000111101 */
+static const xxh_u32 PRIME32_4 = 0x27D4EB2FU;   /* 0b00100111110101001110101100101111 */
+static const xxh_u32 PRIME32_5 = 0x165667B1U;   /* 0b00010110010101100110011110110001 */
+
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)   /* one accumulator step : multiply-add the input word, rotate left by 13, multiply */
+{
+    acc += input * PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= PRIME32_1;
+#if defined(__GNUC__) && defined(__SSE4_1__) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /* UGLY HACK:
+     * This inline assembly hack forces acc into a normal register. This is the
+     * only thing that prevents GCC and Clang from autovectorizing the XXH32 loop
+     * (pragmas and attributes don't work for some reason) without globally
+     * disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on newer chips!)
+     *   making it slightly slower to multiply four integers at once compared to four
+     *   integers independently. Even when pmulld was fastest, Sandy/Ivy Bridge, it is
+     *   still not worth it to go into SSE just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // v >>= 19
+     *      por    v,  tmp // v |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because the
+     *   SIMD actually serializes this operation: While v1 is rotating, v2 can load data,
+     *   while v3 can multiply. SSE forces them to operate together.
+     *
+     * How this hack works:
+     * __asm__(""       // Declare an assembly block but don't declare any instructions
+     *          :       // However, as an Input/Output Operand,
+     *          "+r"    // constrain a read/write operand (+) as a general purpose register (r).
+     *          (acc)   // and set acc as the operand
+     * );
+     *
+     * Because of the 'r', the compiler has promised that acc will be in a
+     * general purpose register and the '+' says that it will be 'read/write',
+     * so it has to assume it has changed. It is like volatile without all the
+     * loads and stores.
+     *
+     * Since the argument has to be in a normal register (not an SSE register),
+     * each time XXH32_round is called, it is impossible to vectorize. */
+    __asm__("" : "+r" (acc));
+#endif
+    return acc;
+}
+
+/* mix all bits : xor-shift right by 15, 13 then 16, with multiplies by PRIME32_2 and PRIME32_3 in between */
+static xxh_u32 XXH32_avalanche(xxh_u32 h32)
+{
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+    return(h32);
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)
+
+static xxh_u32
+XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)   /* folds the trailing (len & 15) bytes at ptr into h32, then avalanches */
+{
+#define PROCESS1               \
+    h32 += (*ptr++) * PRIME32_5; \
+    h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;   /* PROCESS1 : consumes one byte and advances ptr */
+
+#define PROCESS4                         \
+    h32 += XXH_get32bits(ptr) * PRIME32_3; \
+    ptr+=4;                                \
+    h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;   /* PROCESS4 : consumes one little-endian 32-bit word; XXH_get32bits() uses the local `align` */
+
+    /* Compact rerolled version : two plain loops, taken when XXH_REROLL is non-zero */
+    if (XXH_REROLL) {
+        len &= 15;
+        while (len >= 4) {
+            PROCESS4;
+            len -= 4;
+        }
+        while (len > 0) {
+            PROCESS1;
+            --len;
+        }
+        return XXH32_avalanche(h32);
+    } else {   /* unrolled version : one switch entry per remaining byte count, same word-then-byte order as the loops above */
+         switch(len&15) /* or switch(bEnd - p) */ {
+           case 12:      PROCESS4;
+                         /* fallthrough */
+           case 8:       PROCESS4;
+                         /* fallthrough */
+           case 4:       PROCESS4;
+                         return XXH32_avalanche(h32);
+
+           case 13:      PROCESS4;
+                         /* fallthrough */
+           case 9:       PROCESS4;
+                         /* fallthrough */
+           case 5:       PROCESS4;
+                         PROCESS1;
+                         return XXH32_avalanche(h32);
+
+           case 14:      PROCESS4;
+                         /* fallthrough */
+           case 10:      PROCESS4;
+                         /* fallthrough */
+           case 6:       PROCESS4;
+                         PROCESS1;
+                         PROCESS1;
+                         return XXH32_avalanche(h32);
+
+           case 15:      PROCESS4;
+                         /* fallthrough */
+           case 11:      PROCESS4;
+                         /* fallthrough */
+           case 7:       PROCESS4;
+                         /* fallthrough */
+           case 3:       PROCESS1;
+                         /* fallthrough */
+           case 2:       PROCESS1;
+                         /* fallthrough */
+           case 1:       PROCESS1;
+                         /* fallthrough */
+           case 0:       return XXH32_avalanche(h32);
+        }
+        XXH_ASSERT(0);   /* every value of (len & 15) has a case above that returns */
+        return h32;   /* reaching this point is deemed impossible */
+    }
+}
+
+XXH_FORCE_INLINE xxh_u32
+XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)   /* one-shot XXH32 core; `align` is forwarded to every XXH_get32bits() read */
+{
+    const xxh_u8* bEnd = input + len;
+    xxh_u32 h32;
+
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+    if (input==NULL) {
+        len=0;
+        bEnd=input=(const xxh_u8*)(size_t)16;   /* arbitrary non-NULL placeholder : with len==0 no code path below reads through it */
+    }
+#endif
+
+    if (len>=16) {   /* bulk phase : four accumulators, 16 input bytes per loop iteration */
+        const xxh_u8* const limit = bEnd - 15;   /* input < limit  <=>  at least 16 bytes remain */
+        xxh_u32 v1 = seed + PRIME32_1 + PRIME32_2;
+        xxh_u32 v2 = seed + PRIME32_2;
+        xxh_u32 v3 = seed + 0;
+        xxh_u32 v4 = seed - PRIME32_1;
+
+        do {
+            v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4;
+            v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4;
+            v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4;
+            v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4;
+        } while (input < limit);
+
+        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
+            + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);   /* merge the four accumulators */
+    } else {
+        h32  = seed + PRIME32_5;   /* short input : no accumulator phase */
+    }
+
+    h32 += (xxh_u32)len;   /* total length, truncated to 32 bits */
+
+    return XXH32_finalize(h32, input, len&15, align);   /* input now points at the unconsumed tail */
+}
+
+
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)   /* one-shot 32-bit hash of `len` bytes at `input`, parameterized by `seed` */
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, (const xxh_u8*)input, len);
+    return XXH32_digest(&state);
+
+#else
+
+    if (XXH_FORCE_ALIGN_CHECK) {   /* 0/1 tuning macro, see its definition in the tuning parameters section */
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);   /* same core, with reads routed through XXH_read32() */
+#endif
+}
+
+
+
+/*******   Hash streaming   *******/
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)   /* allocates a state object; its contents are NOT initialized here */
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));   /* whatever XXH_malloc() returns is passed straight to the caller */
+}
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)   /* releases a state through XXH_free() */
+{
+    XXH_free(statePtr);
+    return XXH_OK;   /* unconditional : there is no failure path */
+}
+
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)   /* duplicates a streaming state */
+{
+    memcpy(dstState, srcState, sizeof(*dstState));   /* plain byte copy of the whole struct */
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)   /* (re)starts a streaming session with `seed`; always returns XXH_OK */
+{
+    XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state));   /* every field not assigned below starts at zero */
+    state.v1 = seed + PRIME32_1 + PRIME32_2;   /* same four start values as the one-shot path in XXH32_endian_align() */
+    state.v2 = seed + PRIME32_2;
+    state.v3 = seed + 0;   /* v3 == seed : XXH32_digest() reads the seed back from here for short inputs */
+    state.v4 = seed - PRIME32_1;
+    /* do not write into reserved, planned to be removed in a future version */
+    memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));   /* NOTE(review): assumes `reserved` is the last member - struct layout is not visible here, confirm */
+    return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)   /* feeds `len` more bytes into a streaming session */
+{
+    if (input==NULL)   /* checked before len : a NULL pointer with len==0 is still XXH_ERROR in the default configuration */
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
+#endif
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;
+
+        state->total_len_32 += (XXH32_hash_t)len;   /* 32-bit running length */
+        state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));   /* sticky flag : once set it is never cleared here */
+
+        if (state->memsize + len < 16)  {   /* fill in tmp buffer */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len);
+            state->memsize += (XXH32_hash_t)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* some data left from previous update */
+            XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);   /* top the buffer up to exactly 16 bytes */
+            {   const xxh_u32* p32 = state->mem32;
+                state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
+                state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
+                state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
+                state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
+            }
+            p += 16-state->memsize;   /* skip the input bytes that were just consumed through the buffer */
+            state->memsize = 0;
+        }
+
+        if (p <= bEnd-16) {   /* NOTE(review): bEnd-16 can point before the start of `input` when fewer than 16 bytes remain; ISO C leaves that pointer arithmetic undefined - (size_t)(bEnd-p) >= 16 would avoid it */
+            const xxh_u8* const limit = bEnd - 16;
+            xxh_u32 v1 = state->v1;   /* work on local copies inside the loop, written back below */
+            xxh_u32 v2 = state->v2;
+            xxh_u32 v3 = state->v3;
+            xxh_u32 v4 = state->v4;
+
+            do {
+                v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
+                v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
+                v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
+                v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
+            } while (p<=limit);
+
+            state->v1 = v1;
+            state->v2 = v2;
+            state->v3 = v3;
+            state->v4 = v4;
+        }
+
+        if (p < bEnd) {   /* fewer than 16 bytes left : keep them for the next update or for digest */
+            XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* state)   /* computes the hash of everything fed so far; the state is only read (const) */
+{
+    xxh_u32 h32;
+
+    if (state->large_len) {   /* flag maintained by XXH32_update() once 16 or more bytes were seen */
+        h32 = XXH_rotl32(state->v1, 1)
+            + XXH_rotl32(state->v2, 7)
+            + XXH_rotl32(state->v3, 12)
+            + XXH_rotl32(state->v4, 18);
+    } else {
+        h32 = state->v3 /* == seed */ + PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);   /* buffered tail bytes, read from the state's own mem32 buffer */
+}
+
+
+/*******   Canonical representation   *******/
+
+/*! Default XXH result types are basic unsigned 32 and 64 bits.
+*   The canonical representation follows human-readable write convention, aka big-endian (large digits first).
+*   These functions allow transformation of hash result into and from its canonical format.
+*   This way, hash values can be written into a file or buffer, remaining comparable across different systems.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)   /* writes `hash` into *dst in canonical (big-endian) byte order */
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);   /* make the in-memory bytes big-endian before copying */
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)   /* inverse of XXH32_canonicalFromHash() */
+{
+    return XXH_readBE32(src);   /* the canonical bytes are big-endian */
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
+
+/*******   Memory access   *******/
+
+typedef XXH64_hash_t xxh_u64;
+
+
+/*! XXH_REROLL_XXH64:
+ * Whether to reroll the XXH64_finalize() loop.
+ *
+ * Just like XXH32, we can unroll the XXH64_finalize() loop. This can be a performance gain
+ * on 64-bit hosts, as only one jump is required.
+ *
+ * However, on 32-bit hosts, because arithmetic needs to be done with two 32-bit registers,
+ * and 64-bit arithmetic needs to be simulated, it isn't beneficial to unroll. The code becomes
+ * ridiculously large (the largest function in the binary on i386!), and rerolling it saves
+ * anywhere from 3kB to 20kB. It is also slightly faster because it fits into cache better
+ * and is more likely to be inlined by the compiler.
+ *
+ * If XXH_REROLL is defined, this is ignored and the loop is always rerolled. */
+#ifndef XXH_REROLL_XXH64
+#  if (defined(__ILP32__) || defined(_ILP32)) /* ILP32 is often defined on 32-bit GCC family */ \
+   || !(defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) /* x86-64 */ \
+     || defined(_M_ARM64) || defined(__aarch64__) || defined(__arm64__) /* aarch64 */ \
+     || defined(__PPC64__) || defined(__PPC64LE__) || defined(__ppc64__) || defined(__powerpc64__) /* ppc64 */ \
+     || defined(__mips64__) || defined(__mips64)) /* mips64 */ \
+   || (!defined(SIZE_MAX) || SIZE_MAX < ULLONG_MAX) /* check limits */
+#    define XXH_REROLL_XXH64 1
+#  else
+#    define XXH_REROLL_XXH64 0
+#  endif
+#endif /* !defined(XXH_REROLL_XXH64) */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Method 2: force direct memory access (plain pointer dereference). Only works on CPUs which support unaligned memory access in hardware. */
+static xxh_u64 XXH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* Method 1: read through a packed union. Safer than a direct dereference, but the packed attribute is compiler specific, */
+/* hence potentially problematic for some compilers; currently only defined for gcc and icc. */
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+static xxh_u64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; }
+
+#else
+
+/* Default: portable and safe solution, copies the 8 bytes with memcpy(). Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio: use the compiler-provided byte swap */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap64 __builtin_bswap64
+#else   /* portable fallback: reverse the eight bytes with shifts and masks */
+static xxh_u64 XXH_swap64 (xxh_u64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |   /* byte 0 (lowest) -> byte 7 (highest) */
+            ((x << 40) & 0x00ff000000000000ULL) |   /* byte 1 -> byte 6 */
+            ((x << 24) & 0x0000ff0000000000ULL) |   /* byte 2 -> byte 5 */
+            ((x << 8)  & 0x000000ff00000000ULL) |   /* byte 3 -> byte 4 */
+            ((x >> 8)  & 0x00000000ff000000ULL) |   /* byte 4 -> byte 3 */
+            ((x >> 24) & 0x0000000000ff0000ULL) |   /* byte 5 -> byte 2 */
+            ((x >> 40) & 0x000000000000ff00ULL) |   /* byte 6 -> byte 1 */
+            ((x >> 56) & 0x00000000000000ffULL);    /* byte 7 -> byte 0 */
+}
+#endif   /* XXH_swap64 selection */
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)   /* load 8 bytes at ptr, interpreted as a little-endian 64-bit value */
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));   /* byte swap only on big-endian hosts */
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)   /* load 8 bytes at ptr, interpreted as a big-endian 64-bit value */
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);   /* byte swap only on little-endian hosts */
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)   /* little-endian 64-bit load, with a direct-dereference path for aligned input */
+{
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);   /* goes through XXH_read64(), i.e. whichever access method was selected at compile time */
+    else
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);   /* plain dereference: the caller vouches that ptr is suitably aligned */
+}
+
+
+/*******   xxh64   *******/
+
+static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL;   /* 0b1001111000110111011110011011000110000101111010111100101010000111 */
+static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL;   /* 0b1100001010110010101011100011110100100111110101001110101101001111 */
+static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL;   /* 0b0001011001010110011001111011000110011110001101110111100111111001 */
+static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL;   /* 0b1000010111101011110010100111011111000010101100101010111001100011 */
+static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL;   /* 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)   /* one accumulator step: fold a 64-bit input word into acc and return the new acc */
+{
+    acc += input * PRIME64_2;     /* scale the input word and add it in */
+    acc  = XXH_rotl64(acc, 31);   /* rotate left by 31 bits */
+    acc *= PRIME64_1;             /* multiply by the first prime */
+    return acc;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)   /* fold one finished accumulator (val) into the combined hash (acc) */
+{
+    val  = XXH64_round(0, val);           /* run val through one round on its own (accumulator starts at 0) */
+    acc ^= val;                           /* mix it into the running hash */
+    acc  = acc * PRIME64_1 + PRIME64_4;   /* multiply-add step after each merged accumulator */
+    return acc;
+}
+
+static xxh_u64 XXH64_avalanche(xxh_u64 h64)   /* final mix applied to the hash before it is returned: alternating xor-shift and multiply steps */
+{
+    h64 ^= h64 >> 33;   /* xor the top 31 bits into the low bits */
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;   /* xor the top 35 bits into the low bits */
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;   /* xor the high half into the low half */
+    return h64;
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)
+
+static xxh_u64
+XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)   /* mixes the last (len & 31) bytes at ptr into h64, then returns the avalanched hash */
+{
+#define PROCESS1_64            \
+    h64 ^= (*ptr++) * PRIME64_5; \
+    h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+
+#define PROCESS4_64          \
+    h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * PRIME64_1; \
+    ptr+=4;                    \
+    h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+
+#define PROCESS8_64 {        \
+    xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); \
+    ptr+=8;                    \
+    h64 ^= k1;               \
+    h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \
+}
+
+    /* Rerolled version for 32-bit targets is faster and much smaller. */
+    if (XXH_REROLL || XXH_REROLL_XXH64) {
+        len &= 31;   /* callers pass the total length; only the bytes past the last full 32-byte stripe are left at ptr */
+        while (len >= 8) {   /* at most three 8-byte words */
+            PROCESS8_64;
+            len -= 8;
+        }
+        if (len >= 4) {   /* then at most one 4-byte word */
+            PROCESS4_64;
+            len -= 4;
+        }
+        while (len > 0) {   /* then at most three single bytes */
+            PROCESS1_64;
+            --len;
+        }
+         return  XXH64_avalanche(h64);
+    } else {
+        switch(len & 31) {   /* unrolled version: one entry label per tail length, falling through 8, 4 and 1 bytes at a time */
+           case 24: PROCESS8_64;
+                         /* fallthrough */
+           case 16: PROCESS8_64;
+                         /* fallthrough */
+           case  8: PROCESS8_64;
+                    return XXH64_avalanche(h64);
+
+           case 28: PROCESS8_64;
+                         /* fallthrough */
+           case 20: PROCESS8_64;
+                         /* fallthrough */
+           case 12: PROCESS8_64;
+                         /* fallthrough */
+           case  4: PROCESS4_64;
+                    return XXH64_avalanche(h64);
+
+           case 25: PROCESS8_64;
+                         /* fallthrough */
+           case 17: PROCESS8_64;
+                         /* fallthrough */
+           case  9: PROCESS8_64;
+                    PROCESS1_64;
+                    return XXH64_avalanche(h64);
+
+           case 29: PROCESS8_64;
+                         /* fallthrough */
+           case 21: PROCESS8_64;
+                         /* fallthrough */
+           case 13: PROCESS8_64;
+                         /* fallthrough */
+           case  5: PROCESS4_64;
+                    PROCESS1_64;
+                    return XXH64_avalanche(h64);
+
+           case 26: PROCESS8_64;
+                         /* fallthrough */
+           case 18: PROCESS8_64;
+                         /* fallthrough */
+           case 10: PROCESS8_64;
+                    PROCESS1_64;
+                    PROCESS1_64;
+                    return XXH64_avalanche(h64);
+
+           case 30: PROCESS8_64;
+                         /* fallthrough */
+           case 22: PROCESS8_64;
+                         /* fallthrough */
+           case 14: PROCESS8_64;
+                         /* fallthrough */
+           case  6: PROCESS4_64;
+                    PROCESS1_64;
+                    PROCESS1_64;
+                    return XXH64_avalanche(h64);
+
+           case 27: PROCESS8_64;
+                         /* fallthrough */
+           case 19: PROCESS8_64;
+                         /* fallthrough */
+           case 11: PROCESS8_64;
+                    PROCESS1_64;
+                    PROCESS1_64;
+                    PROCESS1_64;
+                    return XXH64_avalanche(h64);
+
+           case 31: PROCESS8_64;
+                         /* fallthrough */
+           case 23: PROCESS8_64;
+                         /* fallthrough */
+           case 15: PROCESS8_64;
+                         /* fallthrough */
+           case  7: PROCESS4_64;
+                         /* fallthrough */
+           case  3: PROCESS1_64;
+                         /* fallthrough */
+           case  2: PROCESS1_64;
+                         /* fallthrough */
+           case  1: PROCESS1_64;
+                         /* fallthrough */
+           case  0: return XXH64_avalanche(h64);   /* empty tail: nothing left to mix in */
+        }
+    }
+    /* impossible to reach: both branches above always return */
+    XXH_ASSERT(0);
+    return 0;  /* unreachable, but some compilers complain without it */
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)   /* one-shot XXH64 of len bytes at input; align selects how 64-bit words are loaded */
+{
+    const xxh_u8* bEnd = input + len;   /* one past the last input byte */
+    xxh_u64 h64;
+
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+    if (input==NULL) {   /* treat a NULL input as an empty input */
+        len=0;
+        bEnd=input=(const xxh_u8*)(size_t)32;   /* non-NULL placeholder address; with len==0 no byte is read through it below */
+    }
+#endif
+
+    if (len>=32) {   /* long input: consume it in 32-byte stripes spread over four accumulators */
+        const xxh_u8* const limit = bEnd - 32;   /* last address at which a full stripe can still start */
+        xxh_u64 v1 = seed + PRIME64_1 + PRIME64_2;
+        xxh_u64 v2 = seed + PRIME64_2;
+        xxh_u64 v3 = seed + 0;
+        xxh_u64 v4 = seed - PRIME64_1;
+
+        do {   /* each iteration consumes one stripe, 8 bytes per accumulator */
+            v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
+        } while (input<=limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);   /* combine the four accumulators */
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {   /* short input (< 32 bytes): no stripes, start from the seed */
+        h64  = seed + PRIME64_5;
+    }
+
+    h64 += (xxh_u64) len;   /* fold the total input length into the hash */
+
+    return XXH64_finalize(h64, input, len, align);   /* input now points at the tail; finalize only uses len & 31 */
+}
+
+
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)   /* one-shot 64-bit hash of len bytes at input, using seed */
+{
+#if 0   /* disabled reference implementation built on the streaming API, kept for readability */
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, (const xxh_u8*)input, len);
+    return XXH64_digest(&state);
+
+#else
+
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is 8-byte aligned, let's leverage the speed advantage */
+            return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);   /* general path: no alignment assumption */
+
+#endif
+}
+
+/*******   Hash Streaming   *******/
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)   /* allocates a streaming state; release it with XXH64_freeState() */
+{
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));   /* returned as allocated: the state is not reset here */
+}
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)   /* releases a state through XXH_free() */
+{
+    XXH_free(statePtr);
+    return XXH_OK;   /* always reports success */
+}
+
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)   /* duplicates srcState into dstState */
+{
+    memcpy(dstState, srcState, sizeof(*dstState));   /* byte-wise copy of the whole state structure */
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)   /* (re)initialises *statePtr to start a new hash with seed */
+{
+    XXH64_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+    memset(&state, 0, sizeof(state));   /* zero every field of the local state before the accumulators are set */
+    state.v1 = seed + PRIME64_1 + PRIME64_2;   /* same accumulator start values as the one-shot XXH64_endian_align() */
+    state.v2 = seed + PRIME64_2;
+    state.v3 = seed + 0;   /* v3 holds the bare seed; XXH64_digest() reads it back for inputs shorter than 32 bytes */
+    state.v4 = seed - PRIME64_1;
+     /* do not write into reserved64, might be removed in a future version */
+    memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));   /* copies all but the trailing sizeof(reserved64) bytes; presumably reserved64 is the last member */
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH64_state_t* state, const void* input, size_t len)   /* feeds len more bytes from input into the streaming state */
+{
+    if (input==NULL)   /* NULL input is a successful no-op only when XXH_ACCEPT_NULL_INPUT_POINTER>=1, otherwise an error */
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
+#endif
+
+    {   const xxh_u8* p = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = p + len;   /* one past the last byte of this call's input */
+
+        state->total_len += len;   /* running byte count over all updates; XXH64_digest() reads it */
+
+        if (state->memsize + len < 32) {  /* not enough for a full 32-byte stripe yet: just append to the tmp buffer */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len);
+            state->memsize += (xxh_u32)len;
+            return XXH_OK;
+        }
+
+        if (state->memsize) {   /* some data left from a previous update: top the tmp buffer up to 32 bytes and consume it */
+            XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
+            state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
+            state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
+            state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
+            state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
+            p += 32-state->memsize;   /* skip the input bytes that were just consumed through the tmp buffer */
+            state->memsize = 0;
+        }
+
+        if (p+32 <= bEnd) {   /* at least one full stripe can be read straight from the input */
+            const xxh_u8* const limit = bEnd - 32;   /* last address at which a full stripe can still start */
+            xxh_u64 v1 = state->v1;   /* work on local copies of the accumulators inside the loop */
+            xxh_u64 v2 = state->v2;
+            xxh_u64 v3 = state->v3;
+            xxh_u64 v4 = state->v4;
+
+            do {   /* each iteration consumes one 32-byte stripe, 8 bytes per accumulator */
+                v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
+                v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
+                v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
+                v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
+            } while (p<=limit);
+
+            state->v1 = v1;   /* write the accumulators back to the state */
+            state->v2 = v2;
+            state->v3 = v3;
+            state->v4 = v4;
+        }
+
+        if (p < bEnd) {   /* fewer than 32 bytes remain: stash them for the next update or the digest */
+            XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+            state->memsize = (unsigned)(bEnd-p);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* state)   /* returns the hash of everything fed so far; the state is const and left unchanged */
+{
+    xxh_u64 h64;
+
+    if (state->total_len >= 32) {   /* at least one 32-byte stripe went through the accumulators */
+        xxh_u64 const v1 = state->v1;
+        xxh_u64 const v2 = state->v2;
+        xxh_u64 const v3 = state->v3;
+        xxh_u64 const v4 = state->v4;
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);   /* same combination as the one-shot path */
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+    } else {   /* short input: the accumulators were never updated, so v3 still equals the seed set by XXH64_reset() */
+        h64  = state->v3 /*seed*/ + PRIME64_5;
+    }
+
+    h64 += (xxh_u64) state->total_len;   /* fold the total input length into the hash */
+
+    return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);   /* tail bytes are the ones buffered in mem64; finalize only uses total_len & 31 */
+}
+
+
+/******* Canonical representation   *******/
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)   /* writes hash into *dst in canonical (big-endian) byte order */
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));   /* both types must have the same size for the memcpy() below */
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);   /* canonical form is big-endian: swap on little-endian hosts only */
+    memcpy(dst, &hash, sizeof(*dst));   /* store the (now big-endian ordered) bytes into dst */
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)   /* inverse of XXH64_canonicalFromHash() */
+{
+    return XXH_readBE64(src);   /* decode the stored big-endian bytes into a native 64-bit value */
+}
+
+
+
+/* *********************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+
+#include "xxh3.h"
+
+
+#endif  /* XXH_NO_LONG_LONG */
+
+
+#endif  /* XXH_IMPLEMENTATION */
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c7299e9..c29a779 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,7 +1,6 @@
 set(HAVE_MPI 0 CACHE BOOL "Have MPI")
 set(HAVE_AVX2 0 CACHE BOOL "Have AVX2")
 set(HAVE_SSE4_1 0 CACHE BOOL "Have SSE4.1")
-set(HAVE_NEON 0 CACHE BOOL "Have NEON")
 set(HAVE_TESTS 0 CACHE BOOL "Have Tests")
 set(HAVE_SHELLCHECK 1 CACHE BOOL "Have ShellCheck")
 set(HAVE_GPROF 0 CACHE BOOL "Have GPROF Profiler")
@@ -68,15 +67,19 @@ if (CMAKE_BUILD_TYPE MATCHES RELEASE OR CMAKE_BUILD_TYPE MATCHES RELWITHDEBINFO)
     endif ()
 endif ()
 
-append_target_property(mmseqs-framework COMPILE_FLAGS ${MMSEQS_CXX_FLAGS} -fno-exceptions -pedantic -Wall -Wextra -Wdisabled-optimization)
-append_target_property(mmseqs-framework LINK_FLAGS ${MMSEQS_CXX_FLAGS} -fno-exceptions -pedantic -Wall -Wextra -Wdisabled-optimization)
+append_target_property(mmseqs-framework COMPILE_FLAGS ${MMSEQS_CXX_FLAGS} -pedantic -Wall -Wextra -Wdisabled-optimization)
+append_target_property(mmseqs-framework LINK_FLAGS ${MMSEQS_CXX_FLAGS} -pedantic -Wall -Wextra -Wdisabled-optimization)
+if (NOT EMSCRIPTEN)
+    append_target_property(mmseqs-framework COMPILE_FLAGS -fno-exceptions)
+    append_target_property(mmseqs-framework LINK_FLAGS -fno-exceptions)
+endif()
 
 if (ENABLE_WERROR)
     append_target_property(mmseqs-framework COMPILE_FLAGS -Werror)
     append_target_property(mmseqs-framework LINK_FLAGS -Werror)
 endif()
 
-if (CYGWIN OR ${HAVE_NEON})
+if (CYGWIN OR ARM)
     target_compile_definitions(mmseqs-framework PUBLIC -D_GNU_SOURCE=1)
 endif ()
 
@@ -118,7 +121,8 @@ if (HAVE_POSIX_MADVISE)
 endif ()
 
 # SIMD instruction sets support
-if (HAVE_AVX2)
+if (ARM OR PPC64 OR EMSCRIPTEN)
+elseif (HAVE_AVX2)
     target_compile_definitions(mmseqs-framework PUBLIC -DAVX2=1)
     if (CMAKE_COMPILER_IS_CLANG)
         append_target_property(mmseqs-framework COMPILE_FLAGS -mavx2)
@@ -131,8 +135,6 @@ elseif (HAVE_SSE4_1)
     target_compile_definitions(mmseqs-framework PUBLIC -DSSE=1)
     append_target_property(mmseqs-framework COMPILE_FLAGS -msse4.1)
     append_target_property(mmseqs-framework LINK_FLAGS -msse4.1)
-elseif (HAVE_NEON)
-    target_compile_definitions(mmseqs-framework PUBLIC -DSSE=1 -DNEON=1)
 else ()
     include(CheckSSEFeatures)
     append_target_property(mmseqs-framework COMPILE_FLAGS ${SSE_FLAGS})
@@ -150,9 +152,16 @@ else ()
     endif (HAVE_AVX2_EXTENSIONS)
 endif ()
 
-# tinyexpr
-target_link_libraries(mmseqs-framework tinyexpr)
+target_link_libraries(mmseqs-framework tinyexpr libzstd_static microtar)
+if (CYGWIN)
+    target_link_libraries(mmseqs-framework nedmalloc)
+endif ()
 
+if (EMSCRIPTEN)
+    target_compile_definitions(mmseqs-framework PUBLIC -DHAVE_ZLIB=1 -DHAVE_BZLIB=1)
+    append_target_property(mmseqs-framework COMPILE_FLAGS -s USE_ZLIB=1 -s USE_BZIP2=1)
+    append_target_property(mmseqs-framework LINK_FLAGS -s USE_ZLIB=1 -s USE_BZIP2=1)
+else ()
 find_package(ZLIB QUIET)
 if (ZLIB_FOUND)
     message("-- Found ZLIB")
@@ -208,6 +217,7 @@ if (BZIP2_FOUND)
 else ()
     message("-- Could not find BZLIB")
 endif ()
+endif ()
 
 # MPI
 if (HAVE_MPI)
diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
index e0e001a..b193e1a 100644
--- a/src/CommandDeclarations.h
+++ b/src/CommandDeclarations.h
@@ -30,10 +30,12 @@ extern int view(int argc, const char **argv, const Command& command);
 extern int rmdb(int argc, const char **argv, const Command& command);
 extern int mvdb(int argc, const char **argv, const Command& command);
 extern int createtsv(int argc, const char **argv, const Command& command);
+extern int databases(int argc, const char **argv, const Command& command);
 extern int dbtype(int argc, const char **argv, const Command& command);
 extern int decompress(int argc, const char **argv, const Command &command);
 extern int diffseqdbs(int argc, const char **argv, const Command& command);
 extern int easycluster(int argc, const char **argv, const Command& command);
+extern int easyrbh(int argc, const char **argv, const Command& command);
 extern int easylinclust(int argc, const char **argv, const Command& command);
 extern int easysearch(int argc, const char **argv, const Command& command);
 extern int easylinsearch(int argc, const char **argv, const Command& command);
@@ -77,6 +79,7 @@ extern int ungappedprefilter(int argc, const char **argv, const Command& command
 extern int rbh(int argc, const char **argv, const Command& command);
 extern int result2flat(int argc, const char **argv, const Command& command);
 extern int result2msa(int argc, const char **argv, const Command& command);
+extern int result2dnamsa(int argc, const char **argv, const Command& command);
 extern int result2pp(int argc, const char **argv, const Command& command);
 extern int result2profile(int argc, const char **argv, const Command& command);
 extern int result2rbh(int argc, const char **argv, const Command& command);
@@ -102,8 +105,11 @@ extern int createtaxdb(int argc, const char **argv, const Command& command);
 extern int translateaa(int argc, const char **argv, const Command& command);
 extern int translatenucs(int argc, const char **argv, const Command& command);
 extern int tsv2db(int argc, const char **argv, const Command& command);
+extern int tar2db(int argc, const char **argv, const Command& command);
 extern int versionstring(int argc, const char **argv, const Command& command);
 extern int addtaxonomy(int argc, const char **argv, const Command& command);
 extern int filtertaxdb(int argc, const char **argv, const Command& command);
+extern int filtertaxseqdb(int argc, const char **argv, const Command& command);
+extern int aggregatetax(int argc, const char **argv, const Command& command);
 extern int diskspaceavail(int argc, const char **argv, const Command& command);
 #endif
diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index 8797c46..3da5b08 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -4,809 +4,1089 @@
 
 Parameters& par = Parameters::getInstance();
 std::vector<Command> baseCommands = {
-// Main tools (for non-experts)
-        {"createdb",             createdb,             &par.createdb,             COMMAND_MAIN,
-                "Convert protein sequence set in a FASTA file to MMseqs sequence DB format",
-                "converts a protein sequence flat/gzipped FASTA or FASTQ file to the MMseqs sequence DB format. This format is needed as input to mmseqs search, cluster and many other tools.",
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:fastaFile1[.gz]> ... <i:fastaFileN[.gz]> <o:sequenceDB>",
-                CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz]",  DbType::ACCESS_MODE_INPUT,  DbType::NEED_DATA | DbType::VARIADIC,  &DbValidator::flatfile },
-                                   {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
         {"easy-search",          easysearch,           &par.easysearchworkflow,   COMMAND_EASY,
-                "Search with a query fasta against target fasta (or database) and return a BLAST-compatible result in a single step",
-                "Searches with a sequence FASTA file through the target sequence FASTA file or DB by in a single step. This combines createdb, search, summarizeresults, convert and convertalis modules into a single workflow.",
+                "Sensitive homology search",
+                "# Search multiple FASTA against FASTA (like BLASTP, TBLASTN, BLASTX, BLASTN --search-type 3, TBLASTX --search-type 2)\n"
+                "mmseqs easy-search examples/QUERY.fasta examples/QUERY.fasta examples/DB.fasta result.m8 tmp\n\n"
+                "# Iterative profile search from stdin (like PSI-BLAST)\n"
+                "cat examples/QUERY.fasta | mmseqs easy-search stdin examples/DB.fasta result.m8 tmp --num-iterations 2\n\n"
+                "# Profile search against small databases (e.g. PFAM, eggNOG)\n"
+                "mmseqs databases PFAM pfam_db tmp\n"
+                "mmseqs easy-search examples/QUERY.fasta pfam_db res.m8 tmp\n\n"
+                "# Exhaustive search against sequences or profiles (works for large DBs)\n"
+                "mmseqs easy-search examples/QUERY.fasta targetProfiles res.m8 tmp --slice-search\n\n"
+                "# Increasing sensitivity search (from 2 to 7 in 3 steps)\n"
+                "mmseqs easy-search examples/QUERY.fasta examples/DB.fasta result.m8 tmp --start-sens 2 -s 7 --sens-steps 3\n",
                 "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryFastaFile1[.gz]> ... <i:queryFastaFileN[.gz]> <i:targetFastaFile[.gz]>|<i:targetDB> <o:alignmentFile> <tmpDir>",
-                CITATION_SERVER | CITATION_MMSEQS2,{{"queryFastaFile[.gz]",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC,  &DbValidator::flatfile },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
-                                         {"alignmentFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile  },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"easy-linsearch",       easylinsearch,        &par.easylinsearchworkflow,COMMAND_EASY,
-                "Linear time search with a query fasta against target fasta (or database) and return a BLAST-compatible result in a single step",
-                "Searches with a sequence FASTA file through the target sequence FASTA file or DB by in a single step. This combines createdb, linsearch, summarizeresults, convert and convertalis modules into a single workflow.",
+                "<i:queryFastaFile1[.gz|.bz2]> ... <i:queryFastaFileN[.gz|.bz2]>|<i:stdin> <i:targetFastaFile[.gz]>|<i:targetDB> <o:alignmentFile> <tmpDir>",
+                CITATION_SERVER | CITATION_MMSEQS2,{{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                           {"alignmentFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"easy-linsearch",       easylinsearch,        &par.easylinsearchworkflow,COMMAND_EASY | COMMAND_EXPERT,
+                "Fast, less sensitive homology search",
+                NULL,
                 "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryFastaFile1[.gz]> ... <i:queryFastaFileN[.gz]> <i:targetFastaFile[.gz]>|<i:targetDB> <o:alignmentFile> <tmpDir>",
-                CITATION_MMSEQS2|CITATION_LINCLUST, {{"queryFastaFile[.gz]",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfile },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
-                                         {"alignmentFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile  },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+                "<i:queryFastaFile1[.gz|.bz2]> ... <i:queryFastaFileN[.gz|.bz2]> <i:targetFastaFile[.gz|.bz2]>|<i:targetDB> <o:alignmentFile> <tmpDir>",
+                CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                           {"alignmentFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"easy-cluster",         easycluster,          &par.easyclusterworkflow, COMMAND_EASY,
+                "Slower, sensitive clustering",
+                "mmseqs easy-cluster examples/DB.fasta result tmp\n"
+                "# Cluster output\n"
+                "#  - result_rep_seq.fasta: Representatives\n"
+                "#  - result_all_seq.fasta: FASTA-like per cluster\n"
+                "#  - result_cluster.tsv:   Adjacency list\n\n"
+                "# Important parameter: --min-seq-id, --cov-mode and -c \n"
+                "#                  --cov-mode \n"
+                "#                  0    1    2\n"
+                "# Q: MAVGTACRPA  60%  IGN  60%\n"
+                "# T: -AVGTAC---  60% 100%  IGN\n"
+                "#        -c 0.7    -    +    -\n"
+                "#        -c 0.6    +    +    +\n\n"
+                "# Cascaded clustering with reassignment\n"
+                "# - Corrects criteria-violoations of cascaded merging\n"
+                "# - Produces more clusters and is a bit slower\n"
+                "mmseqs easy-cluster examples/DB.fasta result tmp --cluster-reassign\n",
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]> <o:clusterPrefix> <tmpDir>",
+                CITATION_MMSEQS2|CITATION_LINCLUST, {{"queryFastaFile[.gz]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
+                                                           {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
         {"easy-linclust",        easylinclust,         &par.easylinclustworkflow, COMMAND_EASY,
-                "Compute clustering of a fasta/fastq database in linear time. The workflow outputs the representative sequences, a cluster tsv and a fasta-like format containing all sequences.",
-                "Clusters sequences by similarity in linear time. It groups similar sequences together based on user-specified criteria (max. E-value, seq. id., min. coverage,...).",
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryFastaFile1[.gz]> ... <i:queryFastaFileN[.gz]> <o:clusterPrefix> <tmpDir>",
-                CITATION_MMSEQS2|CITATION_LINCLUST, {{"queryFastaFile[.gz]",  DbType::ACCESS_MODE_INPUT,  DbType::NEED_DATA|DbType::VARIADIC,  &DbValidator::flatfile },
-                                         {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT,  DbType::NEED_DATA, &DbValidator::flatfile  },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT,  DbType::NEED_DATA, &DbValidator::directory }}},
-        {"easy-cluster",         easycluster,          &par.easyclusterworkflow,  COMMAND_EASY,
-                "Compute clustering of a fasta database. The workflow outputs the representative sequences, a cluster tsv and a fasta-like format containing all sequences.",
-                "Clusters sequences by similarity. It compares all sequences in the sequence DB with each other using mmseqs search, filters alignments according to user-specified criteria (max. E-value, min. coverage,...),   and runs mmseqs clust to group similar sequences together into clusters.",
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryFastaFile1[.gz]> ... <i:queryFastaFileN[.gz]> <o:clusterPrefix> <tmpDir>",
-                CITATION_MMSEQS2|CITATION_LINCLUST, {{"queryFastaFile[.gz]",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC,  &DbValidator::flatfile },
-                                         {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT,  DbType::NEED_DATA, &DbValidator::flatfile  },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT,  DbType::NEED_DATA, &DbValidator::directory }}},
-        {"easy-taxonomy",             easytaxonomy,             &par.easytaxonomy,             COMMAND_EASY,
-                "Compute taxonomy and lowest common ancestor for each sequence. The workflow outputs a taxonomic classification for sequences and a hierarchical summery report.",
-                "Classifies sequences by alignment followed by LCA (optional). It searches all sequences in the query DB against the target DB using mmseqs search, filters alignments according to the LCA mode: 2bLCA, approx. 2bLCA, whole result or top hit. An LCA is computed from the filtered alignment result.",
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryFastaFile1[.gz]> ... <i:queryFastaFileN[.gz]> <i:targetDB> <o:taxReports> <tmpDir>",
-                CITATION_MMSEQS2, {{"queryFastaFile[.gz]",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfile },
-                                   {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
-                                   {"taxReports",   DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
-                                   {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"search",               search,               &par.searchworkflow,       COMMAND_MAIN,
-                "Search with query sequence or profile DB (iteratively) through target sequence DB",
-                "Searches with the sequences or profiles query DB through the target sequence DB by running the prefilter tool and the align tool for Smith-Waterman alignment. For each query a results file with sequence matches is written as entry into a database of search results (alignmentDB).\nIn iterative profile search mode, the detected sequences satisfying user-specified criteria are aligned to the query MSA, and the resulting query profile is used for the next search iteration. Iterative profile searches are usually much more sensitive than (and at least as sensitive as) searches with single query sequences.",
+                "Fast linear time cluster, less sensitive clustering",
+                "mmseqs easy-linclust examples/DB.fasta result tmp\n\n"
+                "# Linclust output\n"
+                "#  - result_rep_seq.fasta: Representatives\n"
+                "#  - result_all_seq.fasta: FASTA-like per cluster\n"
+                "#  - result_cluster.tsv:   Adjecency list\n\n"
+                "# Important parameter: --min-seq-id, --cov-mode and -c \n"
+                "#                  --cov-mode \n"
+                "#                  0    1    2\n"
+                "# Q: MAVGTACRPA  60%  IGN  60%\n"
+                "# T: -AVGTAC---  60% 100%  IGN\n"
+                "#        -c 0.7    -    +    -\n"
+                "#        -c 0.6    +    +    +\n\n"
+                "# Cluster nucleotide sequences \n"
+                "mmseqs easy-linclust examples/DB.fasta result tmp --kmer-per-seq-scale 0.3\n",
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:targetDB> <o:alignmentDB> <tmpDir>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb  },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"linsearch",               linsearch,               &par.linsearchworkflow,       COMMAND_MAIN,
-                "Search with query sequence  DB through target sequence DB",
-                "Searches with the sequences query DB through the target sequence DB by running the kmersearch tool and the align tool for Smith-Waterman alignment. For each query a results file with sequence matches is written as entry into a database of search results (alignmentDB).\n",
+                "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]> <o:clusterPrefix> <tmpDir>",
+                CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
+                                                            {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                            {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"easy-taxonomy",        easytaxonomy,         &par.easytaxonomy,         COMMAND_EASY,
+                "Taxonomic classification",
+                "# Assign taxonomic labels to FASTA sequences\n"
+                "  - result_tophit_aln: top hits\n"
+                "  - result_tophit_report: coverage profiles per database entry\n"
+                "  - result_report: kraken style report\n"
+                "# Download a sequence database with taxonomy information\n"
+                "mmseqs databases UniProtKB/Swiss-Prot swissprotDB tmp\n\n"
+                "# Assign taxonomy based on top hit\n"
+                "mmseqs easy-taxonomy examples/DB.fasta swissprotDB result tmp\n\n"
+                "# Assign taxonomy based on 2bLCA\n"
+                "mmseqs easy-taxonomy examples/DB.fasta swissprotDB result tmp --lca-mode 2\n",
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:targetDB> <o:alignmentDB> <tmpDir>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"map",                  map,                  &par.mapworkflow,          COMMAND_MAIN,
-                "Fast ungapped mapping of query sequences to target sequences.",
-                "Finds very similar sequence matches in a sequence database. First calls the prefilter module (with a low sensitivity setting) to detect high scoring diagonals and then computes an ungapped alignment with the rescorediagonal module. In contrast to the normal search, for maximum speed no gapped alignment is computed, query sequences are not masked for low complexity regions and no compositional bias correction is applied.",
-                "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:targetDB> <o:alignmentDB> <tmpDir>",
-                CITATION_PLASS|CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"cluster",              clusteringworkflow,   &par.clusterworkflow,      COMMAND_MAIN,
-                "Compute clustering of a sequence DB (quadratic time)",
-                "Clusters sequences by similarity. It compares all sequences in the sequence DB with each other using mmseqs search, filters alignments according to user-specified criteria (max. E-value, min. coverage,...),   and runs mmseqs clust to group similar sequences together into clusters.",
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> & Lars von den Driesch",
-                "<i:sequenceDB> <o:clusterDB> <tmpDir>",
-                CITATION_LINCLUST|CITATION_MMSEQS1|CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA,  &DbValidator::sequenceDb },
-                                         {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"linclust",          linclust,          &par.linclustworkflow,           COMMAND_MAIN,
-                "Cluster sequences of >30% sequence identity *in linear time*",
-                "Detects redundant sequences based on reduced alphabet and k-mer sorting.",
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
-                "<i:sequenceDB> <o:clusterDB> <tmpDir>",
-                CITATION_MMSEQS2|CITATION_LINCLUST, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT,  DbType::NEED_DATA,  &DbValidator::sequenceDb },
-                                         {"clusterDB", DbType::ACCESS_MODE_OUTPUT,  DbType::NEED_DATA, &DbValidator::clusterDb  },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT,  DbType::NEED_DATA, &DbValidator::directory }}},
-        {"indexdb",          indexdb,          &par.indexdb,                      COMMAND_HIDDEN,
-                "Precompute index table of sequence DB for faster searches",
-                "Precomputes an index table for the sequence DB. Handing over the precomputed index table as input to mmseqs search or mmseqs prefilter eliminates the computational overhead of building the index table on the fly.",
+                "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]> <i:targetDB> <o:taxReports> <tmpDir>",
+                CITATION_MMSEQS2, {{"queryFastaFile[.gz]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
+                                                           {"taxReports",   DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"easy-rbh",                  easyrbh,                  &par.easysearchworkflow,       COMMAND_EASY,
+                "Find reciprocal best hit",
+                "# Assign reciprocal best hit\n"
+                "mmseqs easy-rbh examples/QUERY.fasta examples/DB.fasta result tmp\n\n",
+                "Eli Levy Karin & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryFastaFile1[.gz|.bz2]> <i:targetFastaFile[.gz|.bz2]>|<i:targetDB> <o:alignmentFile> <tmpDir>",
+                CITATION_MMSEQS2,{{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
+                                   {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                   {"alignmentFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                   {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"databases",            databases,            &par.databases,            COMMAND_DATABASE_CREATION,
+                "List and download databases",
+                NULL,
+                "Milot Mirdita <milot@mirdita.de>",
+                "<name> <o:sequenceDB> <tmpDir>",
+                CITATION_MMSEQS2, {{"selection", 0, DbType::ZERO_OR_ALL, &DbValidator::empty },
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"tmpDir",     DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"createdb",             createdb,             &par.createdb,             COMMAND_DATABASE_CREATION,
+                "Convert FASTA/Q file(s) to a sequence DB",
+                "# Create a sequence database from multiple FASTA files\n"
+                "mmseqs createdb file1.fa file2.fa.gz file3.fa sequenceDB\n\n"
+                "# Create a seqDB from stdin\n"
+                "cat seq.fasta | mmseqs createdb stdin sequenceDB\n\n"
+                "# Create a seqDB by indexing existing FASTA/Q (for single line fasta entries only)\n"
+                "mmseqs createdb seq.fasta sequenceDB --createdb-mode 1\n",
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>|<i:stdin> <o:sequenceDB>",
+                CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]|stdin", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfileAndStdin },
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
+        {"indexdb",              indexdb,              &par.indexdb,              COMMAND_HIDDEN,
+                NULL,
+                NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:sequenceDB> <o:sequenceIndexDB>",
-                CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
-                                         {"sequenceIndexDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
-        {"createindex",          createindex,          &par.createindex,          COMMAND_MAIN,
-                "Precompute index table of sequence DB for faster searches",
-                "Precomputes an index table for the sequence DB. Handing over the precomputed index table as input to mmseqs search or mmseqs prefilter eliminates the computational overhead of building the index table on the fly.",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                                                           {"sequenceIndexDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
+        {"createindex",          createindex,          &par.createindex,          COMMAND_DATABASE_CREATION,
+                "Store precomputed index on disk to reduce search overhead",
+                "# Create protein sequence index\n"
+                "mmseqs createindex sequenceDB tmp\n\n"
+                "# Create TBLASTX/N index from nucleotide sequences\n"
+                "mmseqs createindex sequenceDB tmp --search-type 2\n\n"
+                "# Create BLASTN index from nucleotide sequences\n"
+                "mmseqs createindex sequenceDB tmp --search-type 3\n",
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:sequenceDB> <tmpDir>",
-                CITATION_SERVER | CITATION_MMSEQS2,{{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"createlinindex",          createlinindex,          &par.createlinindex,          COMMAND_MAIN,
-                "Precompute index for linsearch",
-                "Precomputes a sorted kmer list for the sequence DB.",
+                CITATION_SERVER | CITATION_MMSEQS2,{{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"createlinindex",       createlinindex,       &par.createlinindex,       COMMAND_DATABASE_CREATION | COMMAND_EXPERT,
+                "Create linsearch index",
+                NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:sequenceDB> <tmpDir>",
-                CITATION_SERVER | CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"enrich",                enrich,              &par.enrichworkflow,       COMMAND_MAIN,
-                "Enrich a query set by searching iteratively through a profile sequence set.",
-                "",
+                CITATION_SERVER | CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"convertmsa",           convertmsa,           &par.convertmsa,           COMMAND_DATABASE_CREATION,
+                "Convert Stockholm/PFAM MSA file to a MSA DB",
+                NULL,
+                "Milot Mirdita <milot@mirdita.de>",
+                "<i:msaFile.sto[.gz]> <o:msaDB>",
+                CITATION_SERVER |CITATION_MMSEQS2, {{"msaFile.sto[.gz]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                           {"msaDB",DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::msaDb }}},
+        {"tsv2db",               tsv2db,               &par.tsv2db,               COMMAND_DATABASE_CREATION | COMMAND_EXPERT,
+                "Convert a TSV file to any DB",
+                NULL,
+                "Milot Mirdita <milot@mirdita.de>",
+                "<i:tsvFile> <o:resultDB>",
+                CITATION_MMSEQS2, {{"tsvFile", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                          {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
+        {"tar2db",               tar2db,               &par.tar2db,               COMMAND_DATABASE_CREATION | COMMAND_EXPERT,
+                "Convert content of tar archives to any DB",
+                "# Assuming tar archive containing three aligned FASTA files:\n"
+                "#  * folder/msa1.fa.gz  * folder/msa2.fa  * folder/msa3.fa\n"
+                "# Create a msaDB with three DB entries each containing a separate MSA\n"
+                "mmseqs tar2db archive.tar.gz msaDB --output-dbtype 11\n",
                 "Milot Mirdita <milot@mirdita.de>",
+                "<i:tar[.gz]> ... <i:tar[.gz]> <o:resultDB>",
+                CITATION_MMSEQS2, {{".tar[.gz]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile },
+                                          {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
+
+
+        {"search",               search,               &par.searchworkflow,       COMMAND_MAIN,
+                "Sensitive homology search",
+                "# Search multiple FASTA against FASTA (like BLASTP, TBLASTN, BLASTX, BLASTN --search-type 3, TBLASTX --search-type 2)\n"
+                "mmseqs search queryDB targetDB resultDB tmp\n"
+                "mmseqs convertalis queryDB targetDB resultDB result.m8\n\n"
+                "# Iterative profile search (like PSI-BLAST)\n"
+                "mmseqs search queryDB targetDB resultDB tmp --num-iterations 2\n\n"
+                "# Profile search against small databases (e.g. PFAM, eggNOG)\n"
+                "mmseqs databases PFAM pfam_db tmp\n"
+                "mmseqs search queryDB pfam_db resultDB tmp\n\n"
+                "# Exhaustive search against sequences or profiles (works for large DBs)\n"
+                "mmseqs search queryDB targetDB resultDB tmp --slice-search\n\n"
+                "# Increasing sensitivity search (from 2 to 7 in 3 steps)\n"
+                "mmseqs search queryDB targetDB resultDB --start-sens 2 -s 7 --sens-steps 3\n",
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:queryDB> <i:targetDB> <o:alignmentDB> <tmpDir>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb  },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"linsearch",            linsearch,            &par.linsearchworkflow,    COMMAND_MAIN|COMMAND_EXPERT,
+                "Fast, less sensitive homology search",
+                NULL,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryDB> <i:targetDB> <o:alignmentDB> <tmpDir>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"map",                  map,                  &par.mapworkflow,          COMMAND_MAIN,
+                "Map nearly identical sequences",
+                NULL,
+                "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryDB> <i:targetDB> <o:alignmentDB> <tmpDir>",
+                CITATION_PLASS|CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
         {"rbh",                  rbh,                  &par.searchworkflow,       COMMAND_MAIN,
-                "Find reciprocal best hits between query and target",
-                "",
+                "Reciprocal best hit search",
+                NULL,
                 "Eli Levy Karin",
                 "<i:queryDB> <i:targetDB> <o:alignmentDB> <tmpDir>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb  },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-// Utility tools for format conversions
-        {"createtsv",            createtsv,            &par.createtsv,            COMMAND_FORMAT_CONVERSION,
-                "Create tab-separated flat file from prefilter DB, alignment DB, cluster DB, or taxa DB",
-                "Create tab-separated flat file from prefilter DB, alignment DB, cluster DB, or taxa DB. The target database is optional. This is useful for taxa DB, since it does not have a target key.",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"linclust",          linclust,          &par.linclustworkflow,           COMMAND_MAIN,
+                "Fast, less sensitive clustering",
+                "# Linear-time clustering of FASTA file\n"
+                "mmseqs linclust sequenceDB clusterDB tmp\n\n"
+                "#                  --cov-mode \n"
+                "# Sequence         0    1    2\n"
+                "# Q: MAVGTACRPA  60%  IGN  60%\n"
+                "# T: -AVGTAC---  60% 100%  IGN\n"
+                "# Cutoff -c 0.7    -    +    -\n"
+                "#        -c 0.6    +    +    +\n\n"
+                "# Cluster nucleotide sequences \n"
+                "mmseqs easy-linclust nucl.fasta result tmp --kmer-per-seq-scale 0.3\n",
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> [<i:targetDB>] <i:resultDB> <o:tsvFile>",
-                CITATION_MMSEQS2,{{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
+                "<i:sequenceDB> <o:clusterDB> <tmpDir>",
+                CITATION_MMSEQS2|CITATION_LINCLUST, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"cluster",              clusteringworkflow,   &par.clusterworkflow,      COMMAND_MAIN,
+                "Slower, sensitive clustering",
+                "# Cascaded clustering of FASTA file\n"
+                "mmseqs cluster sequenceDB clusterDB tmp\n\n"
+                "#                  --cov-mode \n"
+                "# Sequence         0    1    2\n"
+                "# Q: MAVGTACRPA  60%  IGN  60%\n"
+                "# T: -AVGTAC---  60% 100%  IGN\n"
+                "# Cutoff -c 0.7    -    +    -\n"
+                "#        -c 0.6    +    +    +\n\n"
+                "# Cascaded clustering with reassignment\n"
+                "# - Corrects criteria-violations of cascaded merging\n"
+                "# - Produces more clusters and is a bit slower\n"
+                "mmseqs cluster sequenceDB clusterDB tmp --cluster-reassign\n",
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> & Lars von den Driesch",
+                "<i:sequenceDB> <o:clusterDB> <tmpDir>",
+                CITATION_LINCLUST|CITATION_MMSEQS1|CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"clusterupdate",        clusterupdate,        &par.clusterUpdate,        COMMAND_MAIN,
+                "Update previous clustering with new sequences",
+                "# Update clustering workflow \n"
+                "# Perform initial clustering of 5000 sequences\n"
+                "mmseqs createdb <(head -n 10000 examples/DB.fasta) sequenceDB\n"
+                "mmseqs cluster sequenceDB clusterDB tmp\n\n"
+                "# Use-case 1: Update by only adding sequences\n"
+                "mmseqs createdb examples/QUERY.fasta addedSequenceDB\n"
+                "mmseqs concatdbs sequenceDB addedSequenceDB allSequenceDB\n"
+                "mmseqs concatdbs sequenceDB_h addedSequenceDB_h allSequenceDB_h\n"
+                "mmseqs clusterupdate sequenceDB allSequenceDB clusterDB newSequenceDB newClusterDB tmp\n\n"
+                "# Use-case 2: Update clustering with deletions\n"
+                "# Create a FASTA file missing 500 of the original sequences and 2500 new ones\n"
+                "mmseqs createdb <(tail -n +1001 examples/DB.fasta | head -n 15000) updateSequenceDB\n"
+                "mmseqs clusterupdate sequenceDB updateSequenceDB clusterDB newSequenceDB newClusterDB tmp\n",
+                "Clovis Galiez & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:oldSequenceDB> <i:newSequenceDB> <i:oldClustResultDB> <o:newMappedSequenceDB> <o:newClustResultDB> <tmpDir>",
+                CITATION_MMSEQS2|CITATION_MMSEQS1,{{"oldSequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER|DbType::NEED_LOOKUP, &DbValidator::sequenceDb },
+                                                          {"newSequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER|DbType::NEED_LOOKUP, &DbValidator::sequenceDb },
+                                                          {"oldClustResultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
+                                                          {"newMappedSequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb},
+                                                          {"newClustResultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb},
+                                                          {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
+        {"taxonomy",             taxonomy,             &par.taxonomy,             COMMAND_MAIN,
+                "Taxonomic classification",
+                "# Download a sequence database with taxonomy information\n"
+                "mmseqs databases UniProtKB/Swiss-Prot swissprotDB tmp\n\n"
+                "# Assign taxonomy based on top hit\n"
+                "mmseqs taxonomy queryDB swissprotDB result tmp\n\n"
+                "# Assign taxonomy based on 2bLCA\n"
+                "mmseqs taxonomy queryDB swissprotDB result tmp --lca-mode 2\n\n"
+                "# Create a Krona report\n"
+                "mmseqs taxonomyreport swissprotDB result report.html --report-mode 1\n",
+                "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryDB> <i:targetDB> <o:taxaDB> <tmpDir>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
+                                                          {"taxaDB",   DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::taxResult },
+                                                          {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+
+
+
         {"convertalis",          convertalignments,    &par.convertalignments,    COMMAND_FORMAT_CONVERSION,
-                "Convert alignment DB to BLAST-tab format or specified custom-column output format",
-                NULL,
+                "Convert alignment DB to BLAST-tab, SAM or custom format",
+                "# Create output in BLAST M8 format (12 columns):\n"
+                "#  (1,2) identifiers for query and target sequences/profiles,\n"
+                "#  (3) sequence identity, (4) alignment length, (5) number of mismatches,\n"
+                "#  (6) number of gap openings, (7-8, 9-10) alignment start and end-position in query and in target,\n"
+                "#  (11) E-value, and (12) bit score\n"
+                "mmseqs convertalis queryDB targetDB resultDB result.m8\n\n"
+                "# Create a TSV containing pairwise alignments\n"
+                "mmseqs convertalis queryDB targetDB resultDB result.tsv --format-output query,target,qaln,taln\n\n"
+                "# Annotate an alignment result with taxonomy information from targetDB\n"
+                "mmseqs convertalis queryDB targetDB resultDB result.tsv --format-output query,target,taxid,taxname,taxlineage\n\n"
+                "# Create SAM output\n"
+                "mmseqs convertalis queryDB targetDB resultDB result.sam --format-mode 1\n\n"
+                "# Create a TSV containing which query file a result comes from\n"
+                "mmseqs createdb euk_queries.fasta bac_queries.fasta queryDB\n"
+                "mmseqs convertalis queryDB targetDB resultDB result.tsv --format-output qset,query,target\n",
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:queryDb> <i:targetDb> <i:alignmentDB> <o:alignmentFile>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA,  &DbValidator::alignmentDb },
-                                         {"alignmentFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
-        {"convertprofiledb",     convertprofiledb,     &par.convertprofiledb,     COMMAND_FORMAT_CONVERSION,
-                "Convert ffindex DB of HMM files to profile DB",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                                          {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"alignmentFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
+        {"createtsv",            createtsv,            &par.createtsv,            COMMAND_FORMAT_CONVERSION,
+                "Convert result DB to tab-separated flat file",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:ffindexProfileDB> <o:profileDB>",
+                "<i:queryDB> [<i:targetDB>] <i:resultDB> <o:tsvFile>",
                 CITATION_MMSEQS2,{{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
         {"convert2fasta",        convert2fasta,        &par.convert2fasta,        COMMAND_FORMAT_CONVERSION,
                 "Convert sequence DB to FASTA format",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
                 "<i:sequenceDB> <o:fastaFile>",
-                CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::allDb },
-                                         {"fastaFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
-        {"result2flat",          result2flat,          &par.result2flat,          COMMAND_FORMAT_CONVERSION,
-                "Create a FASTA-like flat file from prefilter DB, alignment DB, or cluster DB",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::allDb },
+                                                           {"fastaFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
+        {"result2flat",          result2flat,          &par.result2flat,          COMMAND_FORMAT_CONVERSION | COMMAND_EXPERT,
+                "Create flat file by adding FASTA headers to DB entries",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:queryDB> <i:targetDB> <i:resultDB> <o:fastaDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"fastaDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
-// Taxonomy
-        {"taxonomy",             taxonomy,             &par.taxonomy,             COMMAND_TAXONOMY,
-                "Compute taxonomy and lowest common ancestor for each sequence.",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                    {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                    {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                    {"fastaDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
+        {"createseqfiledb",      createseqfiledb,      &par.createseqfiledb,      COMMAND_FORMAT_CONVERSION | COMMAND_EXPERT,
+                "Create a DB of unaligned FASTA entries",
+                "# Gather all sequences from a cluster DB\n"
+                "mmseqs createseqfiledb sequenceDB clusterDB unalignedDB --min-sequences 2\n"
+                "# Build MSAs with Clustal-Omega\n"
+                "mmseqs apply unalignedDB msaDB -- clustalo -i - -o stdout --threads=1\n",
+                "Milot Mirdita <milot@mirdita.de>",
+                "<i:sequenceDB> <i:resultDB> <o:fastaDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"fastaDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
+
+
+
+        {"createtaxdb",          createtaxdb,          &par.createtaxdb,          COMMAND_TAXONOMY,
+                "Add taxonomic labels to sequence DB",
                 NULL,
-                "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:targetDB> <o:taxaDB> <tmpDir>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
-                                         {"taxaDB",   DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::taxResult },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"createtaxdb",          createtaxdb,          &par.createtaxdb,         COMMAND_TAXONOMY,
-                "Annotates a sequence database with NCBI taxonomy information",
-                "Annotates a sequence database with NCBI taxonomy information. The program will download the Uniprot taxMappingFile and ncbi-taxdump-folder and assign taxonmical identifier to the sequence database."
-                "An custom mapping from sequence to taxonomic identifier can be provided by the taxMappingFile.",
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:sequenceDB> <tmpDir>",
-                CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
-        {"addtaxonomy",          addtaxonomy,          &par.addtaxonomy, COMMAND_TAXONOMY,
-                "Add taxonomy information to result database.",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"addtaxonomy",          addtaxonomy,          &par.addtaxonomy,          COMMAND_TAXONOMY | COMMAND_EXPERT,
+                "Add taxonomic labels to result DB",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:targetDB> <i:resultDB> <o:resultDB>",
                 CITATION_MMSEQS2, {{"targetDB", DbType::ACCESS_MODE_INPUT|DbType::NEED_TAXONOMY, DbType::NEED_DATA, &DbValidator::taxSequenceDb },
-                                         {"resultDB",   DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"resultDB",   DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"lca",                  lca,                  &par.lca,                  COMMAND_TAXONOMY,
-                "Compute the lowest common ancestor from a set of taxa.",
-                NULL,
-                "Milot Mirdita <milot@mirdita.de>",
-                "<i:targetDB> <i:resultDB> <o:taxaDB>",
-                CITATION_MMSEQS2, {{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"taxDB",    DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::taxResult }}},
-        {"taxonomyreport",       taxonomyreport,       &par.taxonomyreport,       COMMAND_TAXONOMY,
-                "Create a taxonomy report in either Kraken or Krona mode.",
+                                                           {"resultDB",   DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                                           {"resultDB",   DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"taxonomyreport",       taxonomyreport,       &par.taxonomyreport,       COMMAND_TAXONOMY | COMMAND_FORMAT_CONVERSION,
+                "Create a taxonomy report in Kraken or Krona format",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de> & Florian Breitwieser <florian.bw@gmail.com>",
                 "<i:targetDB> <i:taxDB> <o:taxonomyReport>",
                 CITATION_MMSEQS2, {{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::taxResult },
-                                         {"taxonomyReport",    DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
+                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC,  &DbValidator::taxResult },
+                                                           {"taxonomyReport",    DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
         {"filtertaxdb",          filtertaxdb,          &par.filtertaxdb,          COMMAND_TAXONOMY,
-                "Filter taxonomy database.",
-                NULL,
+                "Filter taxonomy result database",
+                "# Download a sequence database with taxonomy information\n"
+                "mmseqs databases UniProtKB/Swiss-Prot swissprotDB tmp\n"
+                "# Annotate a queryDB with taxonomy information\n"
+                "mmseqs taxonomy queryDB swissprotDB taxDB tmp\n\n"
+                "# Retain all unclassified hits\n"
+                "mmseqs filtertaxdb swissprotDB taxDB filteredTaxDB --taxon-list 0\n"
+                "mmseqs createsubdb <(awk '$3 == 1' filteredTaxDB.index) queryDB queryUnclassifiedDB\n\n"
+                "# Retain all eukaryotic hits except fungi\n"
+                "mmseqs filtertaxdb swissprotDB taxDB filteredTaxDB --taxon-list '2759&&!4751'\n\n"
+                "# Retain all human and chlamydia hits\n"
+                "mmseqs filtertaxdb swissprotDB taxDB filteredTaxDB --taxon-list '9606||810'\n",
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:targetDB> <i:taxaDB> <o:taxaDB>",
+                "<i:targetDB> <i:taxDB> <o:taxDB>",
                 CITATION_MMSEQS2, {{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::taxResult },
-                                         {"taxDB",   DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::taxResult }}},
-// multi hit search
+                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::taxResult },
+                                                           {"taxDB",   DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::taxResult }}},
+        // TODO make consistent with seqTaxDB -> taxSeqDb in Wiki
+        {"filtertaxseqdb",       filtertaxseqdb,       &par.filtertaxseqdb,       COMMAND_TAXONOMY,
+                "Filter taxonomy sequence database",
+                "# Download a sequence database with taxonomy information\n"
+                "mmseqs databases UniProtKB/Swiss-Prot swissprotDB tmp\n\n"
+                "# Retain all bacterial sequences\n"
+                "mmseqs filtertaxseqdb swissprotDB swissprotDB_only_bac --taxon-list 2\n\n"
+                "# Retain all eukaryotic sequences except fungi\n"
+                "mmseqs filtertaxseqdb swissprotDB swissprotDB_euk_wo_fungi --taxon-list '2759&&!4751'\n\n"
+                "# Retain all human and chlamydia sequences\n"
+                "mmseqs filtertaxseqdb swissprotDB swissprotDB_human_and_chlamydia --taxon-list '9606||810'\n\n",
+                "Eli Levy Karin <eli.levy.karin@gmail.com> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:taxSeqDB> <o:taxSeqDB>",
+                CITATION_MMSEQS2, {{"taxSeqDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
+                                                           {"taxSeqDB",   DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::taxSequenceDb }}},
+        {"aggregatetax",         aggregatetax,         &par.aggregatetax,         COMMAND_TAXONOMY,
+                "Aggregate multiple taxon labels to a single label",
+                "# Download a sequence database with taxonomy information\n"
+                "mmseqs databases UniProtKB/Swiss-Prot swissprotDB tmp\n\n"
+                "# Create a nucleotide sequence database from FASTA\n"
+                "mmseqs createdb contigs.fasta contigsDb\n\n"
+                "# Extract all orfs from each contig and translate them\n"
+                "mmseqs extractorfs contigsDb orfsAaDb --translate\n\n"
+                "# Assign taxonomy to each orf\n"
+                "mmseqs taxonomy orfsAaDb swissprotDB taxPerOrf tmp\n\n"
+                "# Aggregate taxonomic assignments on each contig\n"
+                "mmseqs aggregatetax swissprotDB orfsAaDb_h taxPerOrf taxPerContig --majority 0.5\n\n",
+                "Eli Levy Karin <eli.levy.karin@gmail.com>",
+                "<i:taxSeqDB> <i:setToSeqMap> <i:taxResPerSeqDB> <o:taxResPerSetDB>",
+                CITATION_MMSEQS2, {{"taxSeqDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
+                                                           {"setToSeqMap",   DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                                           {"taxResPerSeqDB",   DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::taxResult },
+                                                           {"taxResPerSetDB",   DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::taxResult }}},
+        {"lca",                  lca,                  &par.lca,                  COMMAND_TAXONOMY,
+                "Compute the lowest common ancestor",
+                NULL,
+                "Milot Mirdita <milot@mirdita.de>",
+                "<i:targetDB> <i:resultDB> <o:taxaDB>",
+                CITATION_MMSEQS2, {{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb },
+                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"taxDB",    DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::taxResult }}},
+
+
+
         {"multihitdb",           multihitdb,           &par.multihitdb,           COMMAND_MULTIHIT,
-                "Create sequence database and associated metadata for multi hit searches",
+                "Create sequence DB for multi hit searches",
                 NULL,
                 "Ruoshi Zhang, Clovis Norroy & Milot Mirdita <milot@mirdita.de>",
-                "<i:fastaFile1[.gz]> ... <i:fastaFileN[.gz]> <o:setDB> <tmpDir>",
-                CITATION_MMSEQS2,{{"fast[a|q]File[.gz|bz]",  DbType::ACCESS_MODE_INPUT,  DbType::NEED_DATA | DbType::VARIADIC,  &DbValidator::flatfile },
-                                   {"setDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
-                                   {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
+                "<i:fastaFile1[.gz|bz2]> ... <i:fastaFileN[.gz|bz2]> <o:setDB> <tmpDir>",
+                CITATION_MMSEQS2, {{"fast[a|q]File[.gz|bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile },
+                                                           {"setDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
         {"multihitsearch",       multihitsearch,       &par.multihitsearch,       COMMAND_MULTIHIT,
                 "Search with a grouped set of sequences against another grouped set",
                 NULL,
                 "Ruoshi Zhang, Clovis Norroy & Milot Mirdita <milot@mirdita.de>",
                 "<i:querySetDB> <i:targetSetDB> <o:resultDB> <tmpDir>",
-                CITATION_MMSEQS2,{{"querySetDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetSetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb  },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+                CITATION_MMSEQS2, {{"querySetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetSetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
         {"besthitperset",        besthitperset,        &par.besthitbyset,         COMMAND_MULTIHIT,
-                "For each set of sequences compute the best element and updates the p-value",
+                "For each set of sequences compute the best element and update p-value",
                 NULL,
                 "Ruoshi Zhang, Clovis Norroy & Milot Mirdita <milot@mirdita.de>",
                 " <i:targetSetDB> <i:resultDB> <o:resultDB>",
-                CITATION_MMSEQS2,{{"querySetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                        {"targetSetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
+                CITATION_MMSEQS2, {{"querySetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetSetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
         {"combinepvalperset",    combinepvalperset,    &par.combinepvalbyset,     COMMAND_MULTIHIT,
                 "For each set compute the combined p-value",
                 NULL,
                 "Ruoshi Zhang, Clovis Norroy & Milot Mirdita <milot@mirdita.de>",
                 "<i:querySetDB> <i:targetSetDB> <i:resultDB> <o:pvalDB>",
-                CITATION_MMSEQS2,{{"querySetDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetSetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"pvalDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+                CITATION_MMSEQS2, {{"querySetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetSetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"pvalDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
         {"mergeresultsbyset",    mergeresultsbyset,    &par.threadsandcompression,COMMAND_MULTIHIT,
-                "Merge results from multiple orfs back to their respective contig",
+                "Merge results from multiple ORFs back to their respective contig",
                 NULL,
                 "Ruoshi Zhang, Clovis Norroy & Milot Mirdita <milot@mirdita.de>",
                 "<i:setDB> <i:DB> <o:DB>",
-                CITATION_MMSEQS2,{{"setDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-// Utility tools for clustering
-        {"clusterupdate",        clusterupdate,        &par.clusterUpdate,        COMMAND_MAIN,
-                "Update clustering of old sequence DB to clustering of new sequence DB",
-                NULL,
-                "Clovis Galiez & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:oldSequenceDB> <i:newSequenceDB> <i:oldClustResultDB> <o:newMappedSequenceDB> <o:newClustResultDB> <tmpDir>",
-                CITATION_MMSEQS2|CITATION_MMSEQS1,{{"oldSequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER|DbType::NEED_LOOKUP, &DbValidator::sequenceDb },
-                                         {"newSequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER|DbType::NEED_LOOKUP, &DbValidator::sequenceDb },
-                                         {"oldClustResultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA,  &DbValidator::clusterDb },
-                                         {"newMappedSequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb},
-                                         {"newClustResultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb},
-                                         {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory}}},
-        {"createseqfiledb",      createseqfiledb,      &par.createseqfiledb,      COMMAND_FORMAT_CONVERSION,
-                "Create DB of unaligned FASTA files (1 per cluster) from sequence DB and cluster DB",
-                NULL,
-                "Milot Mirdita <milot@mirdita.de>",
-                "<i:sequenceDB> <i:clusterDB> <o:fastaDB>",
-                CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"clusterDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
-                                         {"fastaDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
-        {"mergeclusters",        mergeclusters,        &par.threadsandcompression,COMMAND_CLUSTER,
-                "Merge multiple cluster DBs into single cluster DB",
+                CITATION_MMSEQS2, {{"setDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                                           {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+
+
+
+        {"prefilter",            prefilter,            &par.prefilter,            COMMAND_PREFILTER,
+                "Double consecutive diagonal k-mer search",
                 NULL,
-                "Maria Hauser & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:sequenceDB> <o:clusterDB> <i:clusterDB1> ... <i:clusterDBn>",
-                CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
-                                         {"clusterDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::clusterDb }}},
-// Expert tools (for advanced users)
-        {"prefilter",            prefilter,            &par.prefilter,            COMMAND_EXPERT,
-                "Search with query sequence / profile DB through target DB (k-mer matching + ungapped alignment)",
-                "Searches with the sequences or profiles in query DB through the target sequence DB in two consecutive stages: a very fast k-mer matching stage (double matches on same diagonal) and a subsequent ungapped alignment stage. For each query a results file with sequence matches is written as entry into the prefilter DB.",
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> & Maria Hauser",
                 "<i:queryDB> <i:targetDB> <o:prefilterDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
 
-        {"ungappedprefilter",    ungappedprefilter,    &par.ungappedprefilter,    COMMAND_EXPERT,
-                "Search with query sequence / profile DB through target DB and compute optimal ungapped alignment score",
-                "Searches with the sequences or profiles in query DB through the target sequence DB. We compute ungapped alignment score for each diagonal. For each query a results file with sequence matches is written as entry into the prefilter DB.",
+        {"ungappedprefilter",    ungappedprefilter,    &par.ungappedprefilter,    COMMAND_PREFILTER,
+                "Optimal diagonal score search",
+                NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:queryDB> <i:targetDB> <o:prefilterDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
+        {"kmermatcher",          kmermatcher,          &par.kmermatcher,          COMMAND_PREFILTER,
+                "Find bottom-m-hashed k-mer matches within sequence DB",
+                NULL,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:sequenceDB> <o:prefilterDB>",
+                CITATION_MMSEQS2,{{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                          {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
-        {"align",                align,                &par.align,                COMMAND_EXPERT,
-                "Compute Smith-Waterman alignments for previous results (e.g. prefilter DB, cluster DB)",
-                "Calculates Smith-Waterman alignment scores between all sequences in the query database and the sequences of the target database which passed the prefiltering.",
+        {"kmersearch",           kmersearch,           &par.kmersearch,           COMMAND_PREFILTER,
+                "Find bottom-m-hashed k-mer matches between target and query DB",
+                NULL,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryDB> <i:kmerIndexDB> <o:prefilterDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                         {"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::indexDb },
+                                         {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
+        {"kmerindexdb",          kmerindexdb,          &par.kmerindexdb,          COMMAND_HIDDEN,
+                "Create bottom-m-hashed k-mer index",
+                NULL,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
+                "<i:sequenceDB> <o:kmerIndexDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                         {"kmerIndexDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}}, // NOTE(review): usage string marks kmerIndexDB as <o:...>, but this descriptor is ACCESS_MODE_INPUT with the sequenceDb validator - confirm whether ACCESS_MODE_OUTPUT was intended
+
+
+
+        {"align",                align,                &par.align,                COMMAND_ALIGNMENT,
+                "Optimal gapped local alignment",
+                NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> & Maria Hauser",
                 "<i:queryDB> <i:targetDB> <i:resultDB> <o:alignmentDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
-        {"alignall",             alignall,             &par.align,                COMMAND_EXPERT,
-                "Compute all against all Smith-Waterman alignments for a results (e.g. prefilter DB, cluster DB)",
-                "Calculates an all against all Smith-Waterman alignment scores between all sequences in a result. It reports all hits which passed the alignment criteria.",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"alignall",             alignall,             &par.alignall,             COMMAND_ALIGNMENT,
+                "Within-result all-vs-all gapped local alignment",
+                NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:sequenceDB> <i:resultDB> <o:alignmentDB>",
                 CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
-        {"transitivealign",             transitivealign,             &par.align,                COMMAND_EXPERT,
-                "Transfers alignments by transitivity via a center star alignment",
-                "It infers the alignment from sequence A->C via B, where B is the center sequence and A,C are aligned against B.",
+                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"transitivealign",      transitivealign,      &par.align,                COMMAND_ALIGNMENT,
+                "Transfer alignments via transitivity",
+                //"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B",
+                NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:sequenceDB> <i:alignmentDB> <o:alignmentDB>",
                 CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+                                                           {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"rescorediagonal",     rescorediagonal,       &par.rescorediagonal,      COMMAND_ALIGNMENT,
+                "Compute sequence identity for diagonal",
+                NULL,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryDB> <i:targetDB> <i:prefilterDB> <o:resultDB>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"alignbykmer",         alignbykmer,           &par.alignbykmer,          COMMAND_ALIGNMENT,
+                "Heuristic gapped local k-mer based alignment",
+                NULL,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryDB> <i:targetDB> <i:resultDB> <o:resultDB>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+
+
 
-        {"clust",                clust,                &par.clust,                COMMAND_EXPERT,
-                "Cluster sequence DB from alignment DB (e.g. created by searching DB against itself)",
-                "Computes a clustering of a sequence DB based on the alignment DB containing each query sequence or profile Smith-Waterman alignments generated by mmseqs align. (When given a prefilter DB as input, the tool will use the ungapped alignment scores for the clustering.) The tool reads the search results DB, constructs a similarity graph based on the matched sequences in alignment DB, and applies one of several clustering algorithms. The first, representative sequence of each cluster is connected by an edge to each cluster member. Its names are used as ID in the resulting cluster DB, the entries contain the names of all member sequences.",
+        {"clust",                clust,                &par.clust,                COMMAND_CLUSTER,
+                "Cluster result by Set-Cover/Connected-Component/Greedy-Incremental",
+                NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> & Lars von den Driesch & Maria Hauser",
                 "<i:sequenceDB> <i:resultDB> <o:clusterDB>",
                 CITATION_MMSEQS2|CITATION_MMSEQS1,{{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb }}},
-        {"kmermatcher",          kmermatcher,          &par.kmermatcher,          COMMAND_EXPERT,
-                "Finds exact $k$-mers matches between sequences",
+                                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                          {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb }}},
+        {"clusthash",            clusthash,            &par.clusthash,            COMMAND_CLUSTER,
+                "Hash-based clustering of equal length sequences",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
-                "<i:sequenceDB> <o:prefilterDB>",
-                CITATION_MMSEQS2,{{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
-        {"kmersearch",          kmersearch,          &par.kmersearch,          COMMAND_EXPERT,
-                "Search with query sequence through target DB.  (k-mer matching) ",
-                "Searches with the query sequence DB through the target sequence DB. Results are target centric target -> query.",
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
-                "<i:queryDB> <i:kmerIndexDB> <o:prefilterDB>",
-                CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::indexDb },
-                                         {"prefilterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefilterDb }}},
-        {"kmerindexdb",          kmerindexdb,          &par.kmerindexdb,          COMMAND_EXPERT,
-                "Finds exact $k$-mers matches between sequences and stores them as index",
-                "Precomputes an index table for the sequence DB. Handing over the precomputed index table as input to mmseqs linsearch.",
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
-                "<i:sequenceDB> <o:kmerIndexDB>",
-                CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"kmerIndexDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
-        {"clusthash",            clusthash,            &par.clusthash,            COMMAND_EXPERT,
-                "Cluster sequences of same length and >90% sequence identity *in linear time*",
-                "Detects redundant sequences based on reduced alphabet hashing and hamming distance.",
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
                 "<i:sequenceDB> <o:alignmentDB>",
-                CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
-// Utility tools to manipulate DBs
-        {"compress",             compress,             &par.onlythreads,          COMMAND_DB,
-                "Compresses a database.",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                          {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"mergeclusters",        mergeclusters,        &par.threadsandcompression,COMMAND_CLUSTER,
+                "Merge multiple cascaded clustering steps",
+                NULL,
+                "Maria Hauser & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:sequenceDB> <o:clusterDB> <i:clusterDB1> ... <i:clusterDBn>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                          {"clusterDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::clusterDb },
+                                                          {"clusterDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::clusterDb }}},
+
+
+
+        {"compress",             compress,             &par.onlythreads,          COMMAND_STORAGE,
+                "Compress DB entries",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
                 "<i:DB> <o:DB>",
                 CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"decompress",           decompress,           &par.onlythreads,          COMMAND_DB,
-                "Decompresses a database.",
+                                                           {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"decompress",           decompress,           &par.onlythreads,          COMMAND_STORAGE,
+                "Decompress DB entries",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
                 "<i:DB> <o:DB>",
                 CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"apply",                apply,                &par.threadsandcompression,COMMAND_DB,
-                "Passes each input database entry to stdin of the specified program, executes it and writes its stdout to the output database.",
+                                                           {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"rmdb",                 rmdb,                 &par.onlyverbosity,        COMMAND_STORAGE,
+                "Remove a DB",
                 NULL,
-                "Milot Mirdita <milot@mirdita.de>",
-                "<i:DB> <o:DB> -- program [args...]",
-                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA,  &DbValidator::allDb },
-                                         {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"extractorfs",          extractorfs,          &par.extractorfs,          COMMAND_DB,
-                "Extract open reading frames from all six frames from nucleotide sequence DB",
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:DB>",
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL }}},
+        {"mvdb",                 mvdb,                 &par.onlyverbosity,        COMMAND_STORAGE,
+                "Move a DB",
                 NULL,
-                "Milot Mirdita <milot@mirdita.de>",
-                "<i:sequenceDB> <o:sequenceDB>",
-                CITATION_MMSEQS2,  {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
-        {"extractframes",        extractframes,        &par.extractframes,        COMMAND_DB,
-                "Extract frames reading frames from a nucleotide sequence DB",
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:srcDB> <o:dstDB>",
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL },
+                                          {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"touchdb",              touchdb,              &par.onlythreads,          COMMAND_STORAGE,
+                "Preload DB into memory (page cache)",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
-                "<i:sequenceDB> <o:sequenceDB>",
-                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA,  &DbValidator::sequenceDb },
-                                         {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
-        {"orftocontig",          orftocontig,          &par.orftocontig,          COMMAND_DB,
-                "Obtain location information of extracted orfs with respect to their contigs in alignment format",
-                "Parses extracted orfs headers to compute their coordinates on the contig and writes the results in alignment format",
-                "Eli Levy Karin <eli.levy.karin@gmail.com> ",
-                "<i:contigsSequenceDB> <i:extractedOrfsHeadersDB> <o:orfsAlignedToContigDB>",
-                CITATION_MMSEQS2,{{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
-        {"reverseseq",          reverseseq,          &par.reverseseq,          COMMAND_DB,
-                "Reverse each sequence in a DB",
-                "Reversed sequences can be used for the assessment of summary statistics computed for the input sequences",
-                "Eli Levy Karin <eli.levy.karin@gmail.com> ",
-                "<i:sequenceDB> <o:revSequenceDB>",
-                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
-        {"touchdb",              touchdb,              &par.onlythreads,          COMMAND_DB,
-                "Memory map database",
-                "Touch every system page of a database",
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
                 "<i:DB>",
                 CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"translatenucs",        translatenucs,        &par.translatenucs,        COMMAND_DB,
-                "Translate nucleotide sequence DB into protein sequence DB",
+
+
+        {"createsubdb",          createsubdb,          &par.createsubdb,          COMMAND_SET,
+                "Create a subset of a DB from list of DB keys",
+                "# Create a new sequenceDB from sequenceDB entries with keys 1, 2 and 3\n"
+                "mmseqs createsubdb <(printf '1\\n2\\n3\\n') sequenceDB oneTwoThreeDB\n\n" // '\\n' keeps a literal backslash-n for the shell's printf; an unescaped \n would put real line breaks inside the quoted argument
+                "# Create a new sequence database with representatives of clusterDB\n"
+                "mmseqs cluster sequenceDB clusterDB tmp\n"
+                "mmseqs createsubdb clusterDB sequenceDB representativesDB\n",
+                "Milot Mirdita <milot@mirdita.de>",
+                "<i:subsetFile|DB> <i:resultDB> <o:resultDB>",
+                CITATION_MMSEQS2, {{"subsetFile", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDbAndFlat },
+                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                          {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"concatdbs",            concatdbs,            &par.concatdbs,            COMMAND_SET,
+                "Concatenate two DBs, giving new IDs to entries from 2nd DB",
+//                "If they exist, the auxiliary files (_mapping, source and lookup) are also concatenated after the IDs of the 2nd DB are updated",
+                "# Download two sequences databases and concat them\n"
+                "mmseqs databases PDB pdbDB tmp\n"
+                "mmseqs databases UniProtKB/Swiss-Prot swissprotDB tmp\n" // same 'databases' module as the PDB line above; the module name was missing here
+                "# Works only single threaded since seq. and header DB need the same ordering\n"
+                "mmseqs concatdbs pdbDB swissprotDB pdbAndSwissprotDB --threads 1\n"
+                "mmseqs concatdbs pdbDB_h swissprotDB_h pdbAndSwissprotDB_h --threads 1\n",
+                "Clovis Galiez, Eli Levy Karin & Martin Steinegger (martin.steinegger@mpibpc.mpg.de)",
+                "<i:DB> <i:DB> <o:DB>",
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                          {"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                          {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"splitdb",              splitdb,              &par.splitdb,              COMMAND_SET,
+                "Split DB into subsets",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
-                "<i:sequenceDB> <o:sequenceDB>",
-                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::nuclDb },
-                                         {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::aaDb }}},
-        {"translateaa",          translateaa,          &par.threadsandcompression,COMMAND_DB,
-                "Translate protein sequence into nucleotide sequence DB",
+                "<i:DB> <o:DB>",
+                CITATION_MMSEQS2,{{"allDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                         {"allDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"mergedbs",             mergedbs,             &par.mergedbs,             COMMAND_SET,
+                "Merge entries from multiple DBs",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:sequenceDB> <o:sequenceDB>",
-                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA,  &DbValidator::aaDb },
-                                         {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::nuclDb }}},
-        {"swapresults",          swapresults,          &par.swapresult,           COMMAND_DB,
-                "Reformat prefilter or alignment DB as if target DB had been searched through query DB",
+                "<i:DB> <o:DB> <i:DB1> ... <i:DBn>",
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                          {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                          {"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::allDb }}},
+        {"subtractdbs",          subtractdbs,          &par.subtractdbs,          COMMAND_SET,
+                "Remove all entries from first DB occurring in second DB by key",
                 NULL,
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>, Clovis Galiez & Eli Levy Karin",
-                "<i:queryDB> <i:targetDB> <i:resultDB> <o:resultDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::prefAlnResDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefAlnResDb }}},
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:resultDBLeft> <i:resultDBRight> <o:resultDB>",
+                CITATION_MMSEQS2, {{"resultDBLeft", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"resultDBRight", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
+
+
+
+        {"view",                 view,                 &par.view,                 COMMAND_DB,
+                "Print DB entries given in --id-list to stdout",
+                "# Print entries with keys 1, 2 and 3 from a sequence DB to stdout\n"
+                "mmseqs view sequenceDB --id-list 1,2,3\n",
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:DB>",
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"apply",                apply,                &par.threadsandcompression,
+#ifdef __CYGWIN__
+                COMMAND_HIDDEN,
+#else
+                COMMAND_DB,
+#endif
+                "Execute given program on each DB entry",
+                "# Gather all sequences from a cluster DB\n"
+                "mmseqs createseqfiledb sequenceDB clusterDB unalignedDB --min-sequences 2\n"
+                "# Build MSAs with Clustal-Omega\n"
+                "mmseqs apply unalignedDB msaDB -- clustalo -i - -o stdout --threads=1\n\n"
+                "# Count lines in each DB entry inefficiently (result2stats is way faster)\n"
+                "mmseqs apply DB wcDB -- awk '{ counter++; } END { print counter; }'\n",
+                "Milot Mirdita <milot@mirdita.de>",
+                "<i:DB> <o:DB> -- program [args...]",
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                                           {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"filterdb",             filterdb,             &par.filterDb,             COMMAND_DB,
+                "DB filtering by given conditions",
+                "# Retain top alignment for each query (alignment DBs are sorted by E-value)\n"
+                "mmseqs filterdb alignmentDB topHitAlignmentDB --extract-lines 1\n\n"
+                "# Extract alignments with Seq.id. greater than 90%\n"
+                "mmseqs filterdb alignmentDB scoreGreater35AlignmentDB --comparison-operator ge --comparison-value 0.9 --filter-column 2\n\n"
+                "# Retain all hits matching a regular expression\n"
+                "mmseqs filterdb alignmentDB regexFilteredDB --filter-regex '^[1-9].$' --filter-column 2\n\n"
+                "# Remove all hits to target keys contained in file db.index\n"
+                "mmseqs filterdb --filter-file db.index --positive-filter false\n\n"
+                "# Retain all hits matching any boolean expression\n"
+                "mmseqs filterdb --filter-expression '$1 * $2 >= 200'\n",
+                "Clovis Galiez & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:resultDB> <o:resultDB>",
+                CITATION_MMSEQS2, {{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                          {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
         {"swapdb",               swapdb,               &par.swapdb,               COMMAND_DB,
-                "Create a DB where the key is from the first column of the input result DB",
+                "Transpose DB with integer values in first column",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>, Clovis Galiez & Eli Levy Karin",
                 "<i:resultDB> <o:resultDB>",
-                CITATION_MMSEQS2, {{"resultDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
-        {"mergedbs",             mergedbs,             &par.mergedbs,             COMMAND_DB,
-                "Merge multiple DBs into a single DB, based on IDs (names) of entries",
+                CITATION_MMSEQS2, {{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                          {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"prefixid",             prefixid,             &par.prefixid,             COMMAND_DB,
+                "For each entry in a DB prepend the entry key to the entry itself",
                 NULL,
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:sequenceDB> <o:resultDB> <i:resultDB1> ... <i:resultDBn>",
-                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::allDb }}},
-        {"splitdb",              splitdb,              &par.splitdb,              COMMAND_DB,
-                "Split a MMseqs2 DB into multiple DBs",
+                "Milot Mirdita <milot@mirdita.de>",
+                "<i:DB> <o:DB>",
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                          {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+        {"suffixid",             suffixid,             &par.prefixid,             COMMAND_DB,
+                "For each entry in a DB append the entry key to the entry itself",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
-                "<i:sequenceDB> <o:sequenceDB_1..N>",
-                CITATION_MMSEQS2,{{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
-        {"splitsequence",       splitsequence,         &par.splitsequence,        COMMAND_DB,
-                "Split sequences by length",
+                "<i:DB> <o:DB>",
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                          {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
+
+
+        {"extractorfs",          extractorfs,          &par.extractorfs,          COMMAND_SEQUENCE,
+                "Six-frame extraction of open reading frames",
                 NULL,
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "Milot Mirdita <milot@mirdita.de>",
                 "<i:sequenceDB> <o:sequenceDB>",
                 CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
-        {"subtractdbs",          subtractdbs,          &par.subtractdbs,          COMMAND_DB,
-                "Generate a DB with entries of first DB not occurring in second DB",
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
+        {"extractframes",        extractframes,        &par.extractframes,        COMMAND_SEQUENCE,
+                "Extract frames from a nucleotide sequence DB",
                 NULL,
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:resultDBLeft> <i:resultDBRight> <o:resultDB>",
-                CITATION_MMSEQS2, {{"resultDBLeft", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"resultDBRight", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
-        {"filterdb",             filterdb,             &par.filterDb,             COMMAND_DB,
-                "Filter a DB by conditioning (regex, numerical, ...) on one of its whitespace-separated columns",
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
+                "<i:sequenceDB> <o:sequenceDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
+        //TODO remove later?
+        {"orftocontig",          orftocontig,          &par.orftocontig,          COMMAND_SEQUENCE,
+                "Write ORF locations in alignment format",
+                NULL,
+                "Eli Levy Karin <eli.levy.karin@gmail.com> ",
+                "<i:contigsSequenceDB> <i:extractedOrfsHeadersDB> <o:orfsAlignedToContigDB>",
+                CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
+        {"reverseseq",          reverseseq,            &par.reverseseq,           COMMAND_SEQUENCE,
+                "Reverse (without complement) sequences",
+                NULL,
+                "Eli Levy Karin <eli.levy.karin@gmail.com> ",
+                "<i:sequenceDB> <o:revSequenceDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
+        {"translatenucs",        translatenucs,        &par.translatenucs,        COMMAND_SEQUENCE,
+                "Translate nucleotides to proteins",
                 NULL,
-                "Clovis Galiez & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:resultDB> <o:resultDB>",
-                CITATION_MMSEQS2, {{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"createsubdb",          createsubdb,          &par.createsubdb,        COMMAND_DB,
-                "Create a subset of a DB from a file of IDs of entries",
-                "This module creates a subset of the input database, containing only the entries that are specified in the subset file. The subset file consists of numeric identifiers separated by new lines",
                 "Milot Mirdita <milot@mirdita.de>",
-                "<i:subsetFile or DB> <i:resultDB> <o:resultDB>",
-                CITATION_MMSEQS2, {{"subsetFile", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDbAndFlat },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"view",          view,          &par.view,        COMMAND_DB,
-                "Prints entries to console",
+                "<i:sequenceDB> <o:sequenceDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::nuclDb },
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::aaDb }}},
+        {"translateaa",          translateaa,          &par.threadsandcompression,COMMAND_SEQUENCE,
+                "Translate proteins to lexicographically lowest codons",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:DB>",
-                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"rmdb",          rmdb,          &par.onlyverbosity,        COMMAND_DB,
-                "Removes the database",
+                "<i:sequenceDB> <o:sequenceDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::aaDb },
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::nuclDb }}},
+        // TODO add SEQUENCE_SPLIT_MODE_SOFT
+        {"splitsequence",       splitsequence,         &par.splitsequence,        COMMAND_SEQUENCE,
+                "Split sequences by length",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:DB>",
-                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL }}},
-        {"mvdb",          mvdb,          &par.onlyverbosity,        COMMAND_DB,
-                "Move the database",
+                "<i:sequenceDB> <o:sequenceDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
+        {"masksequence",        masksequence,          &par.threadsandcompression,COMMAND_SEQUENCE,
+                "Soft mask sequence DB using tantan",
+//                "Low. complex regions are masked as lower case characters. The remaining regions are printed as upper case characters.",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:srcDB> <o:dstDB>",
-                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL },
-                                         {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"result2profile",       result2profile,       &par.result2profile,       COMMAND_DB,
-                "Compute profile and consensus DB from a prefilter, alignment or cluster DB",
+                "<i:sequenceDB> <o:sequenceDB>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
+        {"extractalignedregion", extractalignedregion, &par.extractalignedregion, COMMAND_SEQUENCE,
+                "Extract aligned sequence region from query",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:targetDB> <i:resultDB> <o:profileDB>",
-                CITATION_MMSEQS2,{{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"profileDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::profileDb }}},
-        {"result2pp",            result2pp,            &par.result2pp,            COMMAND_DB,
-                "Merge the query profiles with target profiles according to search results and outputs an enriched profile DB",
+                "<i:queryDB> <i:targetDB> <i:alignmentDB> <o:sequenceDB>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb}}},
+
+
+
+        {"swapresults",          swapresults,          &par.swapresult,           COMMAND_RESULT,
+                "Transpose prefilter/alignment DB",
                 NULL,
-                "Clovis Galiez & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:targetDB> <i:resultDB> <o:profileDB>",
-                CITATION_MMSEQS2,{{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"profileDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::profileDb }}},
-        {"result2rbh",           result2rbh,           &par.threadsandcompression,COMMAND_DB,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>, Clovis Galiez & Eli Levy Karin",
+                "<i:queryDB> <i:targetDB> <i:resultDB> <o:resultDB>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::prefAlnResDb },
+                                                           {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::prefAlnResDb }}},
+        {"result2rbh",           result2rbh,           &par.threadsandcompression,COMMAND_RESULT,
                 "Filter a merged result DB to retain only reciprocal best hits",
                 NULL,
                 "Eli Levy Karin",
                 "<i:resultDB> <o:resultDB>",
                 CITATION_MMSEQS2, {{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
-        {"result2msa",           result2msa,           &par.result2msa,           COMMAND_DB,
-                "Generate MSAs for queries by locally aligning their matched targets in prefilter/alignment/cluster DB",
+                                          {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
+        {"result2msa",           result2msa,           &par.result2msa,           COMMAND_RESULT,
+                "Compute MSA DB from a result DB",
                 NULL,
                 "Martin Steinegger (martin.steinegger@mpibpc.mpg.de) & Milot Mirdita <milot@mirdita.de> & Clovis Galiez",
                 "<i:queryDB> <i:targetDB> <i:resultDB> <o:msaDB>",
-                CITATION_MMSEQS2,{{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                CITATION_MMSEQS2,{{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"msaDB",    DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::msaDb }}},
+        {"result2dnamsa",           result2dnamsa,           &par.result2dnamsa,           COMMAND_RESULT,
+                "Compute MSA DB without insertions in the query for DNA sequences",
+                NULL,
+                "Martin Steinegger (martin.steinegger@mpibpc.mpg.de)",
+                "<i:queryDB> <i:targetDB> <i:resultDB> <o:msaDB>",
+                CITATION_MMSEQS2,{{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                          {"msaDB",    DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::msaDb }}},
-        {"convertmsa",           convertmsa,           &par.convertmsa,           COMMAND_DB,
-                "Turns an MSA file into an MSA database.",
-                "Builds an MSA database out of an MSA file in either Stockholm or PFAM format.",
+        {"result2stats",         result2stats,         &par.result2stats,         COMMAND_RESULT,
+                "Compute statistics for each entry in a DB",
+                NULL,
+                "Clovis Galiez & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryDB> <i:targetDB> <i:resultDB> <o:statsDB>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
+                                          {"statsDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
+        {"offsetalignment",      offsetalignment,      &par.offsetalignment,      COMMAND_RESULT,
+                "Offset alignment by ORF start position",
+                NULL,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryDB> <i:queryOrfDB> <i:targetDB> <i:targetOrfDB> <i:alnDB> <o:alnDB>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"queryOrfDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetOrfDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"alnDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"alnDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"proteinaln2nucl",      proteinaln2nucl,      &par.threadsandcompression,COMMAND_RESULT,
+                "Transform protein alignments to nucleotide alignments",
+                NULL,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
+                "<i:nuclQueryDB> <i:nuclTargetDB> <i:aaQueryDB> <i:aaTargetDB> <i:alnDB> <o:alnDB>",
+                CITATION_MMSEQS2, {{"nuclQueryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"nuclTargetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"aaQueryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"aaTargetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"alnDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"alnDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"result2repseq",       result2repseq,         &par.threadsandcompression,COMMAND_RESULT,
+                "Get representative sequences from result DB",
+                NULL,
+                "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:sequenceDB> <i:resultDB> <o:sequenceDb>",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"sequenceDb", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
+        {"sortresult",           sortresult,           &par.sortresult,           COMMAND_RESULT,
+                "Sort a result DB in the same order as the prefilter or align module",
+                NULL,
                 "Milot Mirdita <milot@mirdita.de>",
-                "<i:msaFile[.gz]> <o:msaDB>",
-                CITATION_SERVER |CITATION_MMSEQS2, {{"msaFile[.gz]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
-                                         {"msaDB",DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::msaDb }}},
-        {"msa2profile",          msa2profile,          &par.msa2profile,          COMMAND_DB,
-                "Turns an MSA database into a MMseqs profile database.",
-                "Builds a profile database from a database containing MSAs. The first sequence in the MSA is chosen as the query sequence. Gap columns (insertions) are discarded.",
+                "<i:resultDB> <o:resultDB>",
+                CITATION_MMSEQS2, {{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
+        {"summarizealis",      summarizealis,      &par.threadsandcompression,     COMMAND_RESULT,
+                "Summarize alignment result to one row (uniq. cov., cov., avg. seq. id.)",
+                NULL,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:alignmentDB> <o:summarizedDB>",
+                CITATION_MMSEQS2|CITATION_UNICLUST, {{"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"summarizedDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
+        {"summarizeresult",      summarizeresult,      &par.summarizeresult,      COMMAND_RESULT,
+                "Extract annotations from alignment DB",
+                NULL,
+                "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:alignmentDB> <o:alignmentDB>",
+                CITATION_MMSEQS2|CITATION_UNICLUST, {{"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+
+
+
+        {"result2profile",       result2profile,       &par.result2profile,       COMMAND_PROFILE,
+                "Compute profile DB from a result DB",
+                NULL,
+                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryDB> <i:targetDB> <i:resultDB> <o:profileDB>",
+                CITATION_MMSEQS2,{{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"profileDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::profileDb }}},
+        {"msa2profile",          msa2profile,          &par.msa2profile,          COMMAND_PROFILE | COMMAND_DATABASE_CREATION,
+                "Convert a MSA DB to a profile DB",
+                "# Convert globally aligned MSAs to profiles\n"
+                "# Defines columns as match columns if more than 50% of residues are not gaps\n"
+                "# Non-match columns are discarded\n"
+                "mmseqs msa2profile msaDB profileDB --match-mode 1 --match-ratio 0.5\n\n"
+                "# Assign match-columns through the first sequence\n"
+                "# Gaps in query sequence define non-match columns and are discarded\n"
+                "mmseqs msa2profile msaDB profileDB --match-mode 0\n",
                 "Milot Mirdita <milot@mirdita.de>",
                 "<i:msaDB> <o:profileDB>",
                 CITATION_SERVER |CITATION_MMSEQS2, {{"msaDB",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::msaDb },
-                                         {"profileDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::profileDb }}},
-        {"profile2pssm",         profile2pssm,         &par.profile2pssm,         COMMAND_DB,
-                "Converts a profile database into a human readable tab-separated PSSM file.",
+                                                           {"profileDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::profileDb }}},
+        {"profile2pssm",         profile2pssm,         &par.profile2pssm,         COMMAND_PROFILE,
+                "Convert a profile DB to a tab-separated PSSM file",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
                 "<i:profileDB> <o:pssmFile>",
                 CITATION_MMSEQS2, {{"profileDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb },
-                                         {"pssmFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
-        {"profile2consensus",    profile2consensus,    &par.profile2seq,          COMMAND_DB,
-                "Extracts consensus sequence database from profile database",
+                                                           {"pssmFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
+        {"profile2consensus",    profile2consensus,    &par.profile2seq,          COMMAND_PROFILE,
+                "Extract consensus sequence DB from a profile DB",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
                 "<i:profileDB> <o:sequenceDB>",
                 CITATION_MMSEQS2, {{"profileDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb },
-                                          {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::aaDb }}},
-        {"profile2repseq",       profile2repseq,       &par.profile2seq,          COMMAND_DB,
-                "Extracts representative sequence database from profile database",
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::aaDb }}},
+        {"profile2repseq",       profile2repseq,       &par.profile2seq,          COMMAND_PROFILE,
+                "Extract representative sequence DB from a profile DB",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
                 "<i:profileDB> <o:sequenceDB>",
                 CITATION_MMSEQS2, {{"profileDB",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::aaDb },
-                                          {"pssmFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
-        {"profile2cs",         profile2cs,             &par.profile2cs,           COMMAND_DB,
-                "Converts a profile database into a column state sequence.",
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::aaDb }}},
+        {"convertprofiledb",     convertprofiledb,     &par.convertprofiledb,     COMMAND_PROFILE,
+                "Convert a HH-suite HHM DB to a profile DB",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:profileDB> <o:csDB>",
-                CITATION_MMSEQS2, {{"profileDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb },
-                                         {"csDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA,  &DbValidator::csDb }}},
-        {"result2stats",         result2stats,         &par.result2stats,         COMMAND_DB,
-                "Compute statistics for each entry in a sequence, prefilter, alignment or cluster DB",
-                NULL,
-                "Clovis Galiez & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:targetDB> <i:resultDB> <o:statsDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::prefAlnResDb },
-                                         {"statsDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
-        {"offsetalignment",      offsetalignment,      &par.offsetalignment,      COMMAND_HIDDEN,
-                "Offset alignment by orf start position.",
-                NULL,
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:queryOrfDB> <i:targetDB> <i:targetOrfDB> <i:alnDB> <o:alnDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"queryOrfDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetOrfDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"alnDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
-                                         {"alnDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
-        {"proteinaln2nucl",      proteinaln2nucl,      &par.threadsandcompression,COMMAND_DB,
-                "Map protein alignment to nucleotide alignment",
-                NULL,
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de> ",
-                "<i:nuclQueryDB> <i:nuclTargetDB> <i:aaQueryDB> <i:aaTargetDB> <i:alnDB> <o:alnDB>",
-                CITATION_MMSEQS2, {{"nuclQueryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"nuclTargetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"aaQueryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"aaTargetDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"alnDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
-                                         {"alnDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
-        {"tsv2db",               tsv2db,               &par.tsv2db,               COMMAND_DB,
-                "Turns a TSV file into a MMseqs database",
+                "<i:hhsuiteHHMDB> <o:profileDB>",
+                CITATION_MMSEQS2,{{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
+
+
+
+        {"enrich",                enrich,              &par.enrichworkflow,       COMMAND_PROFILE_PROFILE,
+                "Boost diversity of search result",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
-                "<i:tsvFile> <o:resultDB>",
-                CITATION_MMSEQS2,  {{"tsvFile",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
-        {"result2repseq",       result2repseq,         &par.threadsandcompression,COMMAND_DB,
-                "Get representative sequences for a result database",
-                NULL,
-                "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:sequenceDB> <i:resultDB> <o:sequenceDb>",
-                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"sequenceDb", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
-// Special-purpose utilities
-        {"rescorediagonal",     rescorediagonal,       &par.rescorediagonal,      COMMAND_SPECIAL,
-                "Compute sequence identity for diagonal",
+                "<i:queryDB> <i:targetDB> <o:alignmentDB> <tmpDir>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        {"result2pp",            result2pp,            &par.result2pp,            COMMAND_PROFILE_PROFILE,
+                "Merge two profile DBs by shared hits",
                 NULL,
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:targetDB> <i:prefilterDB> <o:resultDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                "Clovis Galiez & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:queryDB> <i:targetDB> <i:resultDB> <o:profileDB>",
+                CITATION_MMSEQS2,{{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb },
+                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb },
                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
-        {"alignbykmer",         alignbykmer,           &par.alignbykmer,          COMMAND_SPECIAL,
-                "Predict sequence identity, score, alignment start and end by kmer alignment",
+                                         {"profileDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::profileDb }}},
+        {"profile2cs",         profile2cs,             &par.profile2cs,           COMMAND_PROFILE_PROFILE,
+                "Convert a profile DB into a column state sequence DB",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:targetDB> <i:resultDB> <o:resultDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
-        {"diffseqdbs",           diffseqdbs,           &par.diff,                 COMMAND_SPECIAL,
-                "Find IDs of sequences kept, added and removed between two versions of sequence DB",
-                "It creates 3 filtering files, that can be used in conjunction with \"createsubdb\" tool.\nThe first file contains the keys that has been removed from DBold to DBnew.\nThe second file maps the keys of the kept sequences from DBold to DBnew.\nThe third file contains the keys of the sequences that have been added in DBnew.",
-                "Clovis Galiez & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:oldSequenceDB> <i:newSequenceDB> <o:rmSeqKeysFile> <o:keptSeqKeysFile> <o:newSeqKeysFile>",
-                CITATION_MMSEQS2, {{"oldSequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"newSequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"rmSeqKeysFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
-                                         {"keptSeqKeysFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
-                                         {"newSeqKeysFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
-        {"concatdbs",            concatdbs,            &par.concatdbs,            COMMAND_SPECIAL,
-                "Concatenate two DBs, giving new IDs to entries from second input DB",
-                NULL,
-                "Clovis Galiez & Martin Steinegger (martin.steinegger@mpibpc.mpg.de)",
-                "<i:DB> <i:DB> <o:DB>",
-                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
-        {"sortresult",           sortresult,           &par.sortresult,           COMMAND_SPECIAL,
-                "Sort a result database in the same order as prefilter or align would.",
+                "<i:profileDB> <o:csDB>",
+                CITATION_MMSEQS2, {{"profileDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb },
+                                         {"csDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::csDb }}},
+        {"convertca3m",          convertca3m,          &par.threadsandcompression,COMMAND_PROFILE_PROFILE,
+                "Convert a cA3M DB to a result DB",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
-                "<i:resultbDB> <o:resultDB>",
-                CITATION_MMSEQS2, {{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
-        {"summarizealis",      summarizealis,      &par.threadsandcompression,      COMMAND_SPECIAL,
-                "Summarize alignment results into a single show uniq. coverage, coverage and avg. sequence identity",
+                "<i:ca3mDB> <o:resultDB>",
+                CITATION_MMSEQS2, {{"ca3mDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::ca3mDb },
+                                          {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
+        {"expandaln",           expandaln,             &par.expandaln,            COMMAND_PROFILE_PROFILE,
+                "Expand an alignment result based on another",
                 NULL,
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:alignmentDB> <o:summerizedDB>",
-                CITATION_MMSEQS2|CITATION_UNICLUST, {{"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
-                                         {"summerizedDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
-        {"summarizeresult",      summarizeresult,      &par.summarizeresult,      COMMAND_SPECIAL,
-                "Extract annotations from alignment DB",
+                "Milot Mirdita <milot@mirdita.de>",
+                "<i:queryDB> <i:targetDB> <i:resultDB> <i:resultDB|ca3mDB> <o:alignmentDB>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                          {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+
+
+
+        {"diffseqdbs",           diffseqdbs,           &par.diff,                 COMMAND_SPECIAL,
+                "Compute diff of two sequence DBs",
+//                "It creates 3 filtering files, that can be used in conjunction with \"createsubdb\" tool.\nThe first file contains the keys that have been removed from DBold to DBnew.\nThe second file maps the keys of the kept sequences from DBold to DBnew.\nThe third file contains the keys of the sequences that have been added in DBnew.",
                 NULL,
-                "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:alignmentDB> <o:alignmentDB>",
-                CITATION_MMSEQS2|CITATION_UNICLUST, {{"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+                "Clovis Galiez & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
+                "<i:oldSequenceDB> <i:newSequenceDB> <o:rmSeqKeysFile> <o:keptSeqKeysFile> <o:newSeqKeysFile>",
+                CITATION_MMSEQS2, {{"oldSequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"newSequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"rmSeqKeysFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                           {"keptSeqKeysFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                                           {"newSeqKeysFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
         {"summarizetabs",        summarizetabs,        &par.summarizetabs,        COMMAND_SPECIAL,
-                "Extract annotations from HHblits BAST-tab-formatted results",
+                "Extract annotations from HHblits BLAST-tab-formatted results",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:blastTabDB> <i:lengthFile> <o:summarizedBlastTabDB>",
                 CITATION_MMSEQS2|CITATION_UNICLUST,{{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
-        {"gff2db",               gff2db,               &par.gff2ffindex,          COMMAND_SPECIAL,
-                "Extract regions from a sequence database based on a gff3 (generic feature format) file",
+        {"gff2db",               gff2db,               &par.gff2db,               COMMAND_SPECIAL,
+                "Extract regions from a sequence database based on a GFF3 file",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
                 "<i:gff3File> <i:sequenceDB> <o:sequenceDB>",
                 CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
-                                         {"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
-        {"masksequence",            masksequence,            &par.threadsandcompression,          COMMAND_SPECIAL,
-                "Soft mask sequences using tantan, low. complex regions in lower case the rest upper ",
-                NULL,
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:sequenceDB> <o:sequenceDB>",
-                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
-        {"maskbygff",            maskbygff,            &par.gff2ffindex,          COMMAND_SPECIAL,
-                "X out sequence regions in a sequence DB by features in a gff3 file",
+                                                           {"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
+        {"maskbygff",            maskbygff,            &par.gff2db,               COMMAND_SPECIAL,
+                "Mask out sequence regions in a sequence DB by features selected from a GFF3 file",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
                 "<i:gff3File> <i:sequenceDB> <o:sequenceDB>",
                 CITATION_MMSEQS2, {{"gff3File", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::flatfile },
-                                         {"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
-        {"prefixid",             prefixid,             &par.prefixid,             COMMAND_SPECIAL,
-                "For each entry in a DB prepend the entry ID to the entry itself",
-                NULL,
-                "Milot Mirdita <milot@mirdita.de>",
-                "<i:DB> <o:DB>",
-                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb },
-                                         {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA,  &DbValidator::allDb }}},
-        {"suffixid",             suffixid,             &par.prefixid,             COMMAND_SPECIAL,
-                "For each entry in a DB append the entry ID to the entry itself",
-                NULL,
-                "Milot Mirdita <milot@mirdita.de>",
-                "<i:resultDB> <o:resultDB>",
-                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA,  &DbValidator::allDb },
-                                         {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA,  &DbValidator::allDb }}},
+                                                           {"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"sequenceDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
         {"convertkb",            convertkb,            &par.convertkb,            COMMAND_SPECIAL,
-                "Convert UniProt knowledge base files into MMseqs2 database format for the selected column types",
+                "Convert UniProtKB data to a DB",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
                 "<i:uniprotkb.dat[.gz]> ... <i:uniprotkb.dat[.gz]> <o:uniprotkbDB>",
-                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC,  &DbValidator::flatfile },
-                                   {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA,  &DbValidator::genericDb }}},
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA | DbType::VARIADIC, &DbValidator::flatfile },
+                                                           {"DB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}},
         {"summarizeheaders",     summarizeheaders,     &par.summarizeheaders,     COMMAND_SPECIAL,
-                "Return a new summarized header DB from the UniProt headers of a cluster DB",
+                "Summarize FASTA headers of result DB",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de>",
-                "<i:queryDB> <i:targetDB> <i:clusterDB> <o:headerDB>",
-                CITATION_MMSEQS2|CITATION_UNICLUST, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA,  &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"clusterDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA,  &DbValidator::resultDb },
-                                         {"headerDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb}}},
-        {"extractalignedregion", extractalignedregion, &par.extractalignedregion, COMMAND_SPECIAL,
-                "Extract aligned sequence region from query",
-                NULL,
-                "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
-                "<i:queryDB> <i:targetDB> <i:resultDB> <o:sequenceDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
-                                         {"queryDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb}}},
+                "<i:queryDB> <i:targetDB> <i:resultDb> <o:headerDB>",
+                CITATION_MMSEQS2|CITATION_UNICLUST, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                           {"resultDb", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                           {"headerDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb}}},
         {"extractdomains",       extractdomains,       &par.extractdomains,       COMMAND_SPECIAL,
-                "Extract highest scoring alignment region for each sequence from BLAST-tab file",
+                "Extract highest scoring alignment regions for each sequence from BLAST-tab file",
                 NULL,
                 "Milot Mirdita <milot@mirdita.de> & Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:alignmentDb> <i:msaDB> <o:domainDB>",
-                CITATION_MMSEQS2|CITATION_UNICLUST, {{"alignmentDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
-                                         {"msaDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::msaDb },
-                                         {"domainDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb}}},
-        {"convertca3m",          convertca3m,          &par.threadsandcompression,COMMAND_SPECIAL,
-                "Converts a cA3M database into a MMseqs2 result database.",
-                NULL,
-                "Milot Mirdita <milot@mirdita.de>",
-                "<i:ca3mDB> <o:resultDB>",
-                CITATION_MMSEQS2, {{"ca3mDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::ca3mDb },
-                                         {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
-        {"expandaln",           expandaln,             &par.expandaln,            COMMAND_SPECIAL,
-                "Expands an alignment result based on another.",
-                NULL,
-                "Milot Mirdita <milot@mirdita.de>",
-                "<i:queryDB> <i:targetDB> <i:resultDB> <i:resultDB|ca3mDB> <o:alignmentDB>",
-                CITATION_MMSEQS2, {{"queryDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
-                                         {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+                CITATION_MMSEQS2|CITATION_UNICLUST, {{"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                           {"msaDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::msaDb },
+                                                           {"domainDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb}}},
         {"countkmer",           countkmer,             &par.countkmer,            COMMAND_SPECIAL,
-                "Simple kmer counter, it prints the numeric, alphanumeric representation and kmercount",
+                "Count k-mers",
                 NULL,
                 "Martin Steinegger <martin.steinegger@mpibpc.mpg.de>",
                 "<i:sequenceDB> ",
-                CITATION_MMSEQS2, {{"sequenceDB",  DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }}},
+
+
+
         {"dbtype",              dbtype,                &par.empty,                COMMAND_HIDDEN,
                 "",
                 NULL,
@@ -820,7 +1100,7 @@ std::vector<Command> baseCommands = {
                 "",
                 CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
         {"diskspaceavail",       diskspaceavail,       &par.empty,                COMMAND_HIDDEN,
-                "available disk space in bytes",
+                "Show available disk space in bytes",
                 NULL,
                 "",
                 "",
diff --git a/src/alignment/Alignment.cpp b/src/alignment/Alignment.cpp
index b49df9f..3ce5144 100644
--- a/src/alignment/Alignment.cpp
+++ b/src/alignment/Alignment.cpp
@@ -79,7 +79,7 @@ Alignment::Alignment(const std::string &querySeqDB,
 //        }
         alignmentMode = (alignmentMode > Parameters::ALIGNMENT_MODE_SCORE_COV) ? alignmentMode : Parameters::ALIGNMENT_MODE_SCORE_COV;
     }
-    initSWMode(alignmentMode);
+    swMode = initSWMode(alignmentMode, par.covThr, par.seqIdThr);
 
     if (par.wrappedScoring)
     {
@@ -130,24 +130,19 @@ Alignment::Alignment(const std::string &querySeqDB,
 
     if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
         m = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, scoreBias);
-        gapOpen = par.gapOpen;
-        gapExtend = par.gapExtend;
-        if(par.PARAM_GAP_OPEN.wasSet==false){
-            gapOpen = 5;
-        }
-        if(par.PARAM_GAP_EXTEND.wasSet==false){
-            gapExtend = 2;
-        }
+        gapOpen = par.gapOpen.nucleotides;
+        gapExtend = par.gapExtend.nucleotides;
+        zdrop = par.zdrop;
     } else if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE)){
         SubstitutionMatrix s(par.scoringMatrixFile.aminoacids, 2.0, scoreBias);
         this->m = new SubstitutionMatrixProfileStates(s.matrixName, s.probMatrix, s.pBack, s.subMatrixPseudoCounts, 2.0, scoreBias, 219);
-        gapOpen = par.gapOpen;
-        gapExtend = par.gapExtend;
+        gapOpen = par.gapOpen.aminoacids;
+        gapExtend = par.gapExtend.aminoacids;
     } else {
         // keep score bias at 0.0 (improved ROC)
         m = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, scoreBias);
-        gapOpen = par.gapOpen;
-        gapExtend = par.gapExtend;
+        gapOpen = par.gapOpen.aminoacids;
+        gapExtend = par.gapExtend.aminoacids;
     }
 
     if (realign == true) {
@@ -161,7 +156,8 @@ Alignment::Alignment(const std::string &querySeqDB,
     }
 }
 
-void Alignment::initSWMode(unsigned int alignmentMode) {
+unsigned int Alignment::initSWMode(unsigned int alignmentMode, float covThr, float seqIdThr) {
+    unsigned int swMode = Matcher::SCORE_ONLY;
     switch (alignmentMode) {
         case Parameters::ALIGNMENT_MODE_FAST_AUTO:
             if(covThr > 0.0 && seqIdThr == 0.0) {
@@ -198,6 +194,7 @@ void Alignment::initSWMode(unsigned int alignmentMode) {
             Debug(Debug::ERROR) << "Wrong swMode mode\n";
             EXIT(EXIT_FAILURE);
     }
+    return swMode;
 }
 
 Alignment::~Alignment() {
@@ -294,10 +291,14 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
             char buffer[1024+32768];
             Sequence qSeq(maxSeqLen, querySeqType, m, 0, false, compBiasCorrection);
             Sequence dbSeq(maxSeqLen, targetSeqType, m, 0, false, compBiasCorrection);
-            Matcher matcher(querySeqType, maxSeqLen, m, &evaluer, compBiasCorrection, gapOpen, gapExtend);
+            Matcher matcher(querySeqType,
+                                (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) ? maxSeqLen : std::max(tdbr->getMaxSeqLen(), qdbr->getMaxSeqLen()),
+                                 m, &evaluer, compBiasCorrection, gapOpen, gapExtend, zdrop);
             Matcher *realigner = NULL;
             if (realign ==  true && wrappedScoring == false) {
-                realigner = new Matcher(querySeqType, maxSeqLen, realign_m, &evaluer, compBiasCorrection, gapOpen, gapExtend);
+                realigner = new Matcher(querySeqType,
+                                       (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) ? maxSeqLen : std::max(tdbr->getMaxSeqLen(), qdbr->getMaxSeqLen()),
+                                       realign_m, &evaluer, compBiasCorrection, gapOpen, gapExtend, zdrop);
             }
 
             std::vector<Matcher::result_t> swResults;
@@ -354,7 +355,7 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
                     // Prefilter result (need to make this better)
                     if(elements == 3){
                         hit_t hit = QueryMatcher::parsePrefilterHit(data);
-                        isReverse = (reversePrefilterResult) ?  (hit.prefScore < 0) ? true : false : false;
+                        isReverse = reversePrefilterResult && (hit.prefScore < 0);
                         diagonal = static_cast<short>(hit.diagonal);
                     }
                     size_t dbId = tdbr->getId(dbKey);
@@ -384,15 +385,8 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex,
                         res.seqId = 1.0f;
                     }
                     if(checkCriteria(res, isIdentity, evalThr, seqIdThr, alnLenThr, covMode, covThr)){
-                        if(wrappedScoring){
-                            hit_t hit;
-                            hit.seqId = res.dbKey;
-                            hit.prefScore = (isReverse?-100:100) * res.seqId;
-                            hit.diagonal = isReverse?res.qStartPos-res.dbEndPos:res.qStartPos-res.dbStartPos;
-                            shortResults.emplace_back(hit);
-                        }
-                        else
-                          swResults.emplace_back(res);
+
+                        swResults.emplace_back(res);
                         passedNum++;
                         totalPassedNum++;
                         rejected = 0;
@@ -519,7 +513,7 @@ bool Alignment::checkCriteria(Matcher::result_t &res, bool isIdentity, double ev
 void Alignment::computeAlternativeAlignment(unsigned int queryDbKey, Sequence &dbSeq,
                                             std::vector<Matcher::result_t> &swResults,
                                             Matcher &matcher, float evalThr, int swMode, int thread_idx) {
-    int xIndex = m->aa2int[static_cast<int>('X')];
+    unsigned char xIndex = m->aa2num[static_cast<int>('X')];
     size_t firstItResSize = swResults.size();
     for(size_t i = 0; i < firstItResSize; i++) {
         const bool isIdentity = (queryDbKey == swResults[i].dbKey && (includeIdentity || sameQTDB))
@@ -536,7 +530,7 @@ void Alignment::computeAlternativeAlignment(unsigned int queryDbKey, Sequence &d
 
         dbSeq.mapSequence(dbId, swResults[i].dbKey, dbSeqData, tdbr->getSeqLen(dbId));
         for (int pos = swResults[i].dbStartPos; pos < swResults[i].dbEndPos; ++pos) {
-            dbSeq.int_sequence[pos] = xIndex;
+            dbSeq.numSequence[pos] = xIndex;
         }
         bool nextAlignment = true;
         for (int altAli = 0; altAli < altAlignment && nextAlignment; altAli++) {
@@ -546,7 +540,7 @@ void Alignment::computeAlternativeAlignment(unsigned int queryDbKey, Sequence &d
             if (nextAlignment == true) {
                 swResults.emplace_back(res);
                 for (int pos = res.dbStartPos; pos < res.dbEndPos; pos++) {
-                    dbSeq.int_sequence[pos] = xIndex;
+                    dbSeq.numSequence[pos] = xIndex;
                 }
             }
         }
diff --git a/src/alignment/Alignment.h b/src/alignment/Alignment.h
index c597b12..2334731 100644
--- a/src/alignment/Alignment.h
+++ b/src/alignment/Alignment.h
@@ -37,6 +37,7 @@ class Alignment {
 
     static bool checkCriteria(Matcher::result_t &res, bool isIdentity, double evalThr, double seqIdThr, int alnLenThr, int covMode, float covThr);
 
+    static unsigned int initSWMode(unsigned int alignmentMode, float covThr, float seqIdThr);
 
 private:
     // sequence coverage threshold
@@ -95,6 +96,8 @@ class Alignment {
     int gapOpen;
     // costs to extend a gap
     int gapExtend;
+    // score difference to break alignment
+    int zdrop;
 
 
     // needed for realignment
@@ -110,8 +113,6 @@ class Alignment {
 
     bool reversePrefilterResult;
 
-    void initSWMode(unsigned int alignmentMode);
-
     static size_t estimateHDDMemoryConsumption(int dbSize, int maxSeqs);
 
     void computeAlternativeAlignment(unsigned int queryDbKey, Sequence &dbSeq,
diff --git a/src/alignment/BandedNucleotideAligner.cpp b/src/alignment/BandedNucleotideAligner.cpp
index 81a0720..6e617a8 100644
--- a/src/alignment/BandedNucleotideAligner.cpp
+++ b/src/alignment/BandedNucleotideAligner.cpp
@@ -15,17 +15,16 @@
 #include "StripedSmithWaterman.h"
 
 
-BandedNucleotideAligner::BandedNucleotideAligner(BaseMatrix * subMat, size_t maxSequenceLength, int gapo, int gape) :
+BandedNucleotideAligner::BandedNucleotideAligner(BaseMatrix * subMat, size_t maxSequenceLength, int gapo, int gape, int zdrop) :
 fastMatrix(SubstitutionMatrix::createAsciiSubMat(*subMat))
 {
-
-    targetSeq =  new uint8_t[maxSequenceLength + 1];
-    targetSeqRev =  new uint8_t[maxSequenceLength + 1];
-    querySeq =  new uint8_t[maxSequenceLength + 1];
-    querySeqRev =  new uint8_t[maxSequenceLength + 1];
-    queryRevCompSeq =  new uint8_t[maxSequenceLength + 1];
-    queryRevCompSeqRev =  new uint8_t[maxSequenceLength + 1];
-    queryRevCompCharSeq  =  new char[maxSequenceLength + 1];
+    targetSeqRevDataLen = maxSequenceLength;
+    targetSeqRev = static_cast<uint8_t*>(malloc(targetSeqRevDataLen + 1));
+    querySeqRevDataLen = maxSequenceLength;
+    querySeqRev = static_cast<uint8_t*>(malloc(querySeqRevDataLen + 1));
+    queryRevCompSeq =  static_cast<uint8_t*>(malloc(querySeqRevDataLen + 1));
+    queryRevCompSeqRev =  static_cast<uint8_t*>(malloc(querySeqRevDataLen  + 1));
+    queryRevCompCharSeq  =  static_cast<char*>(malloc(querySeqRevDataLen + 1));
     mat = new int8_t[subMat->alphabetSize*subMat->alphabetSize];
     this->subMat = (NucleotideMatrix*) subMat;
     for (int i = 0; i < subMat->alphabetSize; i++) {
@@ -35,16 +34,15 @@ fastMatrix(SubstitutionMatrix::createAsciiSubMat(*subMat))
     }
     this->gape = gape;
     this->gapo = gapo;
+    this->zdrop = zdrop;
 }
 
 BandedNucleotideAligner::~BandedNucleotideAligner(){
-    delete [] querySeq;
-    delete [] targetSeq;
-    delete [] targetSeqRev;
-    delete [] querySeqRev;
-    delete [] queryRevCompSeq;
-    delete [] queryRevCompSeqRev;
-    delete [] queryRevCompCharSeq;
+    free(targetSeqRev);
+    free(querySeqRev);
+    free(queryRevCompSeq);
+    free(queryRevCompSeqRev);
+    free(queryRevCompCharSeq);
     delete [] fastMatrix.matrixData;
     delete [] fastMatrix.matrix;
     delete [] mat;
@@ -52,16 +50,20 @@ BandedNucleotideAligner::~BandedNucleotideAligner(){
 
 void BandedNucleotideAligner::initQuery(Sequence * query){
     querySeqObj = query;
-    for (int i = 0; i < query->L; ++i) {
-        querySeq[i] = query->int_sequence[i];
+    querySeq = query->numSequence; // point at the query's numeric sequence directly instead of copying it element by element
+    if(query->L >= querySeqRevDataLen){ // query does not fit the recorded capacity: grow all four query scratch buffers together
+        querySeqRev = static_cast<uint8_t *>(realloc(querySeqRev, query->L+1)); // NOTE(review): realloc results are not checked for NULL
+        queryRevCompSeq = static_cast<uint8_t *>(realloc(queryRevCompSeq, query->L+1));
+        queryRevCompCharSeq = static_cast<char *>(realloc(queryRevCompCharSeq, query->L+1));
+        queryRevCompSeqRev = static_cast<uint8_t *>(realloc(queryRevCompSeqRev, query->L+1));
+        querySeqRevDataLen=query->L; // record the new capacity; the buffers themselves hold query->L+1 bytes
     }
     SmithWaterman::seq_reverse((int8_t *)querySeqRev, (int8_t *)querySeq, query->L);
     // needed for rev. complement
     for (int pos = query->L - 1; pos > -1; pos--) {
-        int res = query->int_sequence[pos];
+        int res = query->numSequence[pos]; // residue code at pos, read back to front
         queryRevCompSeq[(query->L - 1) - pos] = subMat->reverseResidue(res);
-        queryRevCompCharSeq[(query->L - 1) - pos] = subMat->int2aa[subMat->reverseResidue(res)];
-
+        queryRevCompCharSeq[(query->L - 1) - pos] = subMat->num2aa[subMat->reverseResidue(res)]; // same reversed residue, mapped to its character via num2aa
     }
     SmithWaterman::seq_reverse((int8_t *)queryRevCompSeqRev, (int8_t *)queryRevCompSeq, query->L);
 }
@@ -82,8 +84,11 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj,
         querySeqAlign     = queryRevCompSeq;
     }
 
-    for (int i = 0; i < targetSeqObj->L; ++i) {
-        targetSeq[i] = targetSeqObj->int_sequence[i];
+
+    const unsigned char * targetSeq = targetSeqObj->numSequence;
+    if(targetSeqObj->L >= targetSeqRevDataLen){
+        targetSeqRev = static_cast<uint8_t *>(realloc(targetSeqRev, targetSeqObj->L+1));
+        targetSeqRevDataLen=targetSeqObj->L;
     }
     SmithWaterman::seq_reverse((int8_t *)targetSeqRev, (int8_t *)targetSeq, targetSeqObj->L);
 
@@ -138,7 +143,7 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj,
         result.tCov = SmithWaterman::computeCov(result.dbStartPos1, result.dbEndPos1, targetSeqObj->L);
         result.evalue = evaluer->computeEvalue(result.score1, origQueryLen);
         for (int i = qUngappedStartPos; i <= qUngappedEndPos; i++) {
-            aaIds += (querySeqAlign[i] == targetSeq[dbUngappedStartPos + (i - dbUngappedStartPos)]) ? 1 : 0;
+            aaIds += (querySeqAlign[i] == targetSeq[dbUngappedStartPos + (i - qUngappedStartPos)]) ? 1 : 0;
         }
         for(int pos = 0; pos <  origQueryLen; pos++){
             backtrace.append("M");
@@ -157,10 +162,11 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj,
     flag |= KSW_EZ_EXTZ_ONLY;
 
     int queryRevLenToAlign = querySeqObj->L - qStartRev;
-    if (wrappedScoring && queryRevLenToAlign > origQueryLen)
+    if (wrappedScoring && queryRevLenToAlign > origQueryLen){
         queryRevLenToAlign = origQueryLen;
+    }
 
-    ksw_extz2_sse(0, queryRevLenToAlign, querySeqRevAlign + qStartRev, targetSeqObj->L - tStartRev, targetSeqRev + tStartRev, 5, mat, gapo, gape, 64, 40, flag, &ez);
+    ksw_extz2_sse(0, queryRevLenToAlign, querySeqRevAlign + qStartRev, targetSeqObj->L - tStartRev, targetSeqRev + tStartRev, 5, mat, gapo, gape, 64, zdrop, flag, &ez);
 
     int qStartPos = querySeqObj->L  - ( qStartRev + ez.max_q ) -1;
     int tStartPos = targetSeqObj->L - ( tStartRev + ez.max_t ) -1;
@@ -177,7 +183,7 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj,
     if (wrappedScoring && queryLenToAlign > origQueryLen)
         queryLenToAlign = origQueryLen;
     ksw_extz2_sse(0, queryLenToAlign, querySeqAlign+qStartPos, targetSeqObj->L-tStartPos, targetSeq+tStartPos, 5,
-                  mat, gapo, gape, 64, 40, alignFlag, &ezAlign);
+                  mat, gapo, gape, 64, zdrop, alignFlag, &ezAlign);
 
     std::string letterCode = "MID";
     uint32_t * retCigar;
@@ -185,7 +191,7 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj,
     if (ez.max_q > ezAlign.max_q && ez.max_t > ezAlign.max_t){
 
         ksw_extz2_sse(0, queryRevLenToAlign, querySeqRevAlign + qStartRev, targetSeqObj->L - tStartRev,
-                      targetSeqRev + tStartRev, 5, mat, gapo, gape, 64, 40, alignFlag, &ezAlign);
+                      targetSeqRev + tStartRev, 5, mat, gapo, gape, 64, zdrop, alignFlag, &ezAlign);
 
         retCigar = new uint32_t[ezAlign.n_cigar];
         for(int i = 0; i < ezAlign.n_cigar; i++){
@@ -208,8 +214,9 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj,
     result.dbEndPos1 = tStartPos+ezAlign.max_t;
     result.dbStartPos1 = tStartPos;
     result.qCov = SmithWaterman::computeCov(result.qStartPos1, result.qEndPos1, querySeqObj->L);
-    if(wrappedScoring)
+    if(wrappedScoring) {
         result.qCov = std::min(1.0f, result.qCov*2);
+    }
     result.tCov = SmithWaterman::computeCov(result.dbStartPos1, result.dbEndPos1, targetSeqObj->L);
     result.evalue = evaluer->computeEvalue(result.score1, origQueryLen);
     if(result.cigar){
diff --git a/src/alignment/BandedNucleotideAligner.h b/src/alignment/BandedNucleotideAligner.h
index 364bbb1..eea7832 100644
--- a/src/alignment/BandedNucleotideAligner.h
+++ b/src/alignment/BandedNucleotideAligner.h
@@ -17,7 +17,7 @@ class BandedNucleotideAligner {
 public:
 
 
-    BandedNucleotideAligner(BaseMatrix *subMat, size_t maxSequenceLength, int gapo, int gape);
+    BandedNucleotideAligner(BaseMatrix *subMat, size_t maxSequenceLength, int gapo, int gape, int zdrop);
 
     ~BandedNucleotideAligner();
 
@@ -28,10 +28,11 @@ class BandedNucleotideAligner {
 
 private:
     SubstitutionMatrix::FastMatrix fastMatrix;
-    uint8_t * targetSeq;
     uint8_t * targetSeqRev;
+    int targetSeqRevDataLen;
     uint8_t * querySeq;
     uint8_t * querySeqRev;
+    int querySeqRevDataLen;
     uint8_t * queryRevCompSeq;
     char * queryRevCompCharSeq;
     uint8_t * queryRevCompSeqRev;
@@ -41,4 +42,5 @@ class BandedNucleotideAligner {
 //    uint32_t * cigar;
     int gapo;
     int gape;
+    int zdrop;
 };
diff --git a/src/alignment/Main.cpp b/src/alignment/Main.cpp
index 7a3094a..ecf033a 100644
--- a/src/alignment/Main.cpp
+++ b/src/alignment/Main.cpp
@@ -13,6 +13,7 @@ int align(int argc, const char **argv, const Command& command) {
     MMseqsMPI::init(argc, argv);
 
     Parameters& par = Parameters::getInstance();
+    par.overrideParameterDescription(par.PARAM_ALIGNMENT_MODE, "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id", NULL, 0);
     par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_ALIGN);
 
     Alignment aln(par.db1, par.db2,
diff --git a/src/alignment/Matcher.cpp b/src/alignment/Matcher.cpp
index 59677df..40dd463 100644
--- a/src/alignment/Matcher.cpp
+++ b/src/alignment/Matcher.cpp
@@ -7,14 +7,14 @@
 
 
 Matcher::Matcher(int querySeqType, int maxSeqLen, BaseMatrix *m, EvalueComputation * evaluer,
-                 bool aaBiasCorrection, int gapOpen, int gapExtend)
+                 bool aaBiasCorrection, int gapOpen, int gapExtend, int zdrop)
                  : gapOpen(gapOpen), gapExtend(gapExtend), m(m), evaluer(evaluer), tinySubMat(NULL) {
     if(Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE) == false ) {
         setSubstitutionMatrix(m);
     }
 
     if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
-        nuclaligner = new BandedNucleotideAligner(m, maxSeqLen, gapOpen, gapExtend);
+        nuclaligner = new BandedNucleotideAligner(m, maxSeqLen, gapOpen, gapExtend, zdrop);
         aligner = NULL;
     } else {
         nuclaligner = NULL;
@@ -51,9 +51,9 @@ void Matcher::initQuery(Sequence* query){
     if(Parameters::isEqualDbtype(query->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES)){
         nuclaligner->initQuery(query);
     }else if(Parameters::isEqualDbtype(query->getSeqType(), Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(query->getSeqType(), Parameters::DBTYPE_PROFILE_STATE_PROFILE)){
-        aligner->ssw_init(query, query->getAlignmentProfile(), this->m, this->m->alphabetSize, 2);
+        aligner->ssw_init(query, query->getAlignmentProfile(), this->m, 2);
     }else{
-        aligner->ssw_init(query, this->tinySubMat, this->m, this->m->alphabetSize, 2);
+        aligner->ssw_init(query, this->tinySubMat, this->m, 2);
     }
 }
 
@@ -88,9 +88,9 @@ Matcher::result_t Matcher::getSWResult(Sequence* dbSeq, const int diagonal, bool
         alignment = nuclaligner->align(dbSeq, diagonal, isReverse, backtrace, aaIds, evaluer, wrappedScoring);
         alignmentMode = Matcher::SCORE_COV_SEQID;
     }else{ if(isIdentity==false){
-            alignment = aligner->ssw_align(dbSeq->int_sequence, dbSeq->L, gapOpen, gapExtend, alignmentMode, evalThr, evaluer, covMode, covThr, maskLen);
+            alignment = aligner->ssw_align(dbSeq->numSequence, dbSeq->L, gapOpen, gapExtend, alignmentMode, evalThr, evaluer, covMode, covThr, maskLen);
         }else{
-            alignment = aligner->scoreIdentical(dbSeq->int_sequence, dbSeq->L, evaluer, alignmentMode);
+            alignment = aligner->scoreIdentical(dbSeq->numSequence, dbSeq->L, evaluer, alignmentMode);
         }
         if(alignmentMode == Matcher::SCORE_COV_SEQID){
             if(isIdentity==false){
@@ -103,7 +103,7 @@ Matcher::result_t Matcher::getSWResult(Sequence* dbSeq, const int diagonal, bool
 
                         for (uint32_t i = 0; i < length; ++i){
                             if (letter == 'M') {
-                                if (dbSeq->int_sequence[targetPos] == currentQuery->int_sequence[queryPos]){
+                                if (dbSeq->numSequence[targetPos] == currentQuery->numSequence[queryPos]){
                                     aaIds++;
                                 }
                                 ++queryPos;
@@ -302,41 +302,8 @@ size_t Matcher::resultToBuffer(char * buff1, const result_t &result, bool addBac
     *(tmpBuff-1) = '\t';
     tmpBuff = Itoa::i32toa_sse2(result.score, tmpBuff);
     *(tmpBuff-1) = '\t';
-    float seqIdFlt = result.seqId;
-    //TODO seqid, evalue
-
-
-    if(seqIdFlt==1.0){
-        *(tmpBuff) = '1';
-        tmpBuff++;
-        *(tmpBuff) = '.';
-        tmpBuff++;
-        *(tmpBuff) = '0';
-        tmpBuff++;
-        *(tmpBuff) = '0';
-        tmpBuff++;
-        *(tmpBuff) = '0';
-        tmpBuff++;
-        *(tmpBuff) = '\t';
-        tmpBuff++;
-    }else{
-        *(tmpBuff) = '0';
-        tmpBuff++;
-        *(tmpBuff) = '.';
-        tmpBuff++;
-        if(seqIdFlt<0.10){
-            *(tmpBuff) = '0';
-            tmpBuff++;
-        }
-        if(seqIdFlt<0.01){
-            *(tmpBuff) = '0';
-            tmpBuff++;
-        }
-        int seqId = seqIdFlt*1000;
-        tmpBuff = Itoa::i32toa_sse2(seqId, tmpBuff);
-        *(tmpBuff-1) = '\t';
-    }
-
+    tmpBuff = Util::fastSeqIdToBuffer(result.seqId, tmpBuff);
+    *(tmpBuff-1) = '\t';
     tmpBuff += sprintf(tmpBuff,"%.3E",result.eval);
     tmpBuff++;
     *(tmpBuff-1) = '\t';
diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h
index fbeb2ca..be203f9 100644
--- a/src/alignment/Matcher.h
+++ b/src/alignment/Matcher.h
@@ -58,14 +58,7 @@ class Matcher{
                                           qStartPos(qStartPos), qEndPos(qEndPos), qLen(qLen),
                                           dbStartPos(dbStartPos), dbEndPos(dbEndPos), dbLen(dbLen),
                                           backtrace(backtrace) {};
-
-
-        result_t(const result_t &res) :  dbKey(res.dbKey), score(res.score), qcov(res.qcov),
-        dbcov(res.dbcov), seqId(res.seqId), eval(res.eval), alnLength(res.alnLength),
-        qStartPos(res.qStartPos), qEndPos(res.qEndPos), qLen(res.qLen),
-        dbStartPos(res.dbStartPos), dbEndPos(res.dbEndPos), dbLen(res.dbLen),
-        backtrace(res.backtrace) {} ;
-
+        
         result_t(){};
 
         static void swapResult(result_t & res, EvalueComputation &evaluer, bool hasBacktrace){
@@ -122,7 +115,7 @@ class Matcher{
 
     Matcher(int querySeqType, int maxSeqLen, BaseMatrix *m,
             EvalueComputation * evaluer, bool aaBiasCorrection,
-            int gapOpen, int gapExtend);
+            int gapOpen, int gapExtend, int zdrop = 40);
 
     ~Matcher();
 
@@ -131,71 +124,52 @@ class Matcher{
                          unsigned int alignmentMode, unsigned int seqIdMode, bool isIdentical, bool wrappedScoring=false);
 
     // need for sorting the results
-    static bool compareHits (const result_t &first, const result_t &second){
-        //return (first.eval < second.eval);
-        if(first.eval < second.eval )
-            return true;
-        if(second.eval < first.eval )
-            return false;
-        if(first.score > second.score )
-            return true;
-        if(second.score > first.score )
-            return false;
-        if(first.dbLen < second.dbLen )
-            return true;
-        if(second.dbLen < first.dbLen )
-            return false;
-        if(first.dbKey < second.dbKey )
-            return true;
-        if(second.dbKey < first.dbKey )
-            return false;
-        return false;
+    static bool compareHits(const result_t &first, const result_t &second) { // true if first sorts before second: eval ascending, then score descending, dbLen ascending, dbKey ascending
+        if (first.eval != second.eval) { // NOTE(review): a NaN eval compares unequal and then returns false here, whereas the removed code fell through to score
+            return first.eval < second.eval;
+        }
+        if (first.score != second.score) {
+            return first.score > second.score; // higher score first
+        }
+        if (first.dbLen != second.dbLen) {
+            return first.dbLen < second.dbLen; // shorter target first
+        }
+        return first.dbKey < second.dbKey; // final tie-break; equal keys compare false both ways
     }
-    static bool compareHitByPos(const result_t &first, const result_t &second){
-
-        int firstQStartPos  = std::min( first.qStartPos, first.qEndPos);
-        int secondQStartPos = std::min( second.qStartPos, second.qEndPos);
-        if(secondQStartPos < firstQStartPos )
-            return false;
-        if(firstQStartPos < secondQStartPos)
-            return true;
-        return false;
 
+    static bool compareHitByPos(const result_t &first, const result_t &second) { // true if first starts earlier on the query than second
+        int firstQStartPos  = std::min(first.qStartPos, first.qEndPos); // smaller of the two query coordinates, so the result does not depend on which one is larger
+        int secondQStartPos = std::min(second.qStartPos, second.qEndPos);
+        return firstQStartPos < secondQStartPos; // equal starts compare false in both directions
     }
+
     // need for sorting the results
-    static bool compareHitsByPosAndStrand (const result_t &first, const result_t &second){
-        //return (first.eval < second.eval);
-        if(second.dbKey < first.dbKey)
-            return false;
-        if(first.dbKey < second.dbKey)
-            return true;
+    static bool compareHitsByPosAndStrand(const result_t &first, const result_t &second) { // order: dbKey, query strand, target strand, diagonal, target start (all ascending)
+        if (first.dbKey != second.dbKey) { // primary key: target database key
+            return first.dbKey < second.dbKey;
+        }
         bool qFirstRev = (first.qStartPos > first.qEndPos);
         bool qSecondRev = (second.qStartPos > second.qEndPos);
-        if(qSecondRev < qFirstRev)
+        if (qSecondRev < qFirstRev) // first is reversed on the query, second is not: second sorts first
             return false;
-        if(qFirstRev < qSecondRev)
+        if (qFirstRev < qSecondRev) // first is forward on the query, second is reversed: first sorts first
             return true;
         bool dbFirstRev = (first.dbStartPos > first.dbEndPos);
         bool dbSecondRev = (second.dbStartPos > second.dbEndPos);
-        if(dbSecondRev < dbFirstRev)
+        if (dbSecondRev < dbFirstRev) // same rule applied to the target coordinates
             return false;
-        if(dbFirstRev < dbSecondRev)
+        if (dbFirstRev < dbSecondRev)
             return true;
-        int firstQStartPos  = std::min( first.qStartPos, first.qEndPos);
-        int secondQStartPos = std::min( second.qStartPos, second.qEndPos);
-        int firstDbStart    = std::min( first.dbStartPos, first.dbEndPos);
-        int secondDbStart   = std::min( second.dbStartPos, second.dbEndPos);
+        int firstQStartPos  = std::min(first.qStartPos, first.qEndPos); // smaller coordinate of each pair, whichever way the hit is stored
+        int secondQStartPos = std::min(second.qStartPos, second.qEndPos);
+        int firstDbStart    = std::min(first.dbStartPos, first.dbEndPos);
+        int secondDbStart   = std::min(second.dbStartPos, second.dbEndPos);
         int firstDiagonal  = firstQStartPos - firstDbStart;
         int secondDiagonal = secondQStartPos - secondDbStart;
-        if(secondDiagonal < firstDiagonal)
-            return false;
-        if(firstDiagonal < secondDiagonal)
-            return true;
-        if(secondDbStart < firstDbStart )
-            return false;
-        if(firstDbStart < secondDbStart)
-            return true;
-        return false;
+        if (firstDiagonal != secondDiagonal) { // diagonal = query start minus target start
+            return firstDiagonal < secondDiagonal;
+        }
+        return firstDbStart < secondDbStart; // final tie-break on target start; full ties compare false both ways
     }
 
     // map new query into memory (create queryProfile, ...)
diff --git a/src/alignment/MsaFilter.cpp b/src/alignment/MsaFilter.cpp
index 33dd0c8..d99fb68 100644
--- a/src/alignment/MsaFilter.cpp
+++ b/src/alignment/MsaFilter.cpp
@@ -44,9 +44,25 @@ MsaFilter::~MsaFilter() {
     delete [] display;
 }
 
-void MsaFilter::filter(const int N_in, const int L, const int coverage, const int qid,
-                       const float qsc, const int max_seqid, int Ndiff,
-                       const char ** X, size_t *N_out) {
+size_t MsaFilter::filter(MultipleAlignment::MSAResult &msa, int coverage, int qid, float qsc, int max_seqid, int Ndiff) { // filters msa via the row-based overload, compacts msa.alignmentResults to the kept rows, returns the kept row count
+    size_t filteredSize = filter(msa.setSize, msa.centerLength, coverage, qid, qsc, max_seqid, Ndiff, (const char **) msa.msaSequence);
+    if (!msa.alignmentResults.empty()) {
+        // alignmentResults does not include the query, but keep[] is indexed by MSA row (row 0 = query): result j belongs to row j + 1
+        for (size_t i = 0, j = 0; j < msa.setSize - 1; j++) {
+            if (keep[j + 1] != 0) { // fix: testing keep[j] read the query's flag for the first hit and never looked at the last row
+                if (i < j) {
+                    std::swap(msa.alignmentResults[i], msa.alignmentResults[j]); // move kept results to the front, preserving their order
+                }
+                i++;
+            }
+        }
+        msa.alignmentResults.resize(filteredSize > 0 ? filteredSize - 1 : 0); // drop rejected results; guard the unsigned underflow if nothing was kept
+    }
+    return filteredSize;
+}
+
+size_t MsaFilter::filter(const int N_in, const int L, const int coverage, const int qid,
+                       const float qsc, const int max_seqid, int Ndiff, const char **X) {
     int seqid1 = 20;
     // X[k][i] contains column i of sequence k in alignment (first seq=0, first char=1) (0-3: ARND ..., 20:X, 21:GAP)
 //    char** X = (char **) &msaSequence;
@@ -258,8 +274,8 @@ void MsaFilter::filter(const int N_in, const int L, const int coverage, const in
 
     // If min required seqid larger than max required seqid, return here without doing pairwise seqid filtering
     if (seqid1 > max_seqid) {
-        *N_out = nn;
-        return;
+        shuffleSequences(X, N_in);
+        return nn;
     }
 
     // Successively increment idmax[i] at positons where N[i]<Ndiff
@@ -395,7 +411,7 @@ void MsaFilter::filter(const int N_in, const int L, const int coverage, const in
 //            // DEBUG
 //            printf("%20.20s with %20.20s:  diff=%i  diff_min_frac*cov_kj=%f  diff_suff=%i  nres=%i  cov_kj=%i\n",sname[k],sname[j],diff,diff_min_frac*cov_kj,diff_suff,nres[k],cov_kj);
 //            printf("%s\n%s\n\n",seq[k],seq[j]);
-                if (diff < diff_suff && float(diff) <= diff_min_frac * cov_kj)
+                if (diff < diff_suff && float(diff) <= diff_min_frac * cov_kj && cov_kj > 0)
                     break;  //dissimilarity < acceptace threshold? Reject!
 
             }
@@ -450,7 +466,8 @@ void MsaFilter::filter(const int N_in, const int L, const int coverage, const in
         keep[k] = in[k];
     }
 
-    *N_out = n;
+    shuffleSequences(X, N_in);
+    return n;
 }
 
 void MsaFilter::shuffleSequences(const char ** X, size_t setSize) {
diff --git a/src/alignment/MsaFilter.h b/src/alignment/MsaFilter.h
index 25c5919..67aecc9 100644
--- a/src/alignment/MsaFilter.h
+++ b/src/alignment/MsaFilter.h
@@ -34,12 +34,9 @@ class MsaFilter {
     // Example: two sequences x and y are 100% identical in their overlapping region but one overlaps by 10% of its
     // length on the left and the other by 20% on the right. Then x has 10% seq.id with y and y has 20% seq.id. with x.
     /////////////////////////////////////////////////////////////////////////////////////
-    void filter(int N_in, int L, int coverage, int qid,
-                float qsc, int max_seqid, int Ndiff,
-                const char ** X, size_t *N_out);
-
-    // shuffles the filtered sequences to the back of the array, the unfiltered ones remain in the front
-    void shuffleSequences(const char ** X, size_t setSize);
+    size_t filter(MultipleAlignment::MSAResult& msa, int coverage, int qid, float qsc, int max_seqid, int Ndiff);
+    size_t filter(const int N_in, const int L, const int coverage, const int qid,
+                  const float qsc, const int max_seqid, int Ndiff, const char **X);
 
     void getKept(bool *offsets, size_t setSize);
 
@@ -50,6 +47,9 @@ class MsaFilter {
 	
 	
 private:
+    // shuffles the filtered sequences to the back of the array, the unfiltered ones remain in the front
+    void shuffleSequences(const char ** X, size_t setSize);
+
     // prune sequence based on score
     int prune(int start, int end, float b, char * query, char *target);
 
diff --git a/src/alignment/MultipleAlignment.cpp b/src/alignment/MultipleAlignment.cpp
index da53942..80133c9 100644
--- a/src/alignment/MultipleAlignment.cpp
+++ b/src/alignment/MultipleAlignment.cpp
@@ -41,7 +41,7 @@ void MultipleAlignment::print(MSAResult msaResult, SubstitutionMatrix * subMat){
     for(size_t i = 0; i < msaResult.setSize; i++) {
         for(size_t pos = 0; pos < msaResult.msaSequenceLength; pos++){
             char aa = msaResult.msaSequence[i][pos];
-            printf("%c", (aa < NAA) ? subMat->int2aa[(int)aa] : '-' );
+            printf("%c", (aa < NAA) ? subMat->num2aa[(int)aa] : '-' );
         }
         printf("\n");
     }
@@ -110,7 +110,7 @@ size_t MultipleAlignment::updateGapsInCenterSequence(char **msaSequence, Sequenc
                 centerSeqPos++;
             }
         }
-        msaSequence[0][centerSeqPos] = subMat->int2aa[centerSeq->int_sequence[queryPos]];
+        msaSequence[0][centerSeqPos] = subMat->num2aa[centerSeq->numSequence[queryPos]];
         centerSeqPos++;
     }
     return centerSeqPos;
@@ -155,7 +155,7 @@ void MultipleAlignment::updateGapsInSequenceSet(char **msaSequence, size_t cente
                 if(bt.at(alnPos) == 'D'){
                     while(bt.at(alnPos) == 'D' &&  alnPos < bt.size() ){
                         if(noDeletionMSA == false) {
-                            edgeSeqMSA[bufferPos] = subMat->int2aa[edgeSeq->int_sequence[targetPos]];
+                            edgeSeqMSA[bufferPos] = subMat->num2aa[edgeSeq->numSequence[targetPos]];
                             bufferPos++;
                         }
                         targetPos++;
@@ -168,7 +168,7 @@ void MultipleAlignment::updateGapsInSequenceSet(char **msaSequence, size_t cente
                         bufferPos++;
                         queryPos++;
                     } else if(bt.at(alnPos) == 'M'){
-                        edgeSeqMSA[bufferPos] = subMat->int2aa[edgeSeq->int_sequence[targetPos]];
+                        edgeSeqMSA[bufferPos] = subMat->num2aa[edgeSeq->numSequence[targetPos]];
                         bufferPos++;
                         queryPos++;
                         targetPos++;
@@ -184,7 +184,7 @@ void MultipleAlignment::updateGapsInSequenceSet(char **msaSequence, size_t cente
                         }
                     }
                     // M state
-                    edgeSeqMSA[bufferPos] = subMat->int2aa[edgeSeq->int_sequence[targetPos]];
+                    edgeSeqMSA[bufferPos] = subMat->num2aa[edgeSeq->numSequence[targetPos]];
 
                     bufferPos++;
                     queryPos++;
@@ -203,7 +203,7 @@ void MultipleAlignment::updateGapsInSequenceSet(char **msaSequence, size_t cente
 
 MultipleAlignment::MSAResult MultipleAlignment::computeMSA(Sequence *centerSeq, const std::vector<Sequence *>& edgeSeqs, bool noDeletionMSA) {
     // just center sequence is included
-    if(edgeSeqs.size() == 0 ){
+    if (edgeSeqs.empty()) {
         return singleSequenceMSA(centerSeq);
     }
 
@@ -218,22 +218,21 @@ MultipleAlignment::MSAResult MultipleAlignment::computeMSA(Sequence *centerSeq,
 
 MultipleAlignment::MSAResult MultipleAlignment::computeMSA(Sequence *centerSeq, const std::vector<Sequence *>& edgeSeqs,
                                                            const std::vector<Matcher::result_t>& alignmentResults, bool noDeletionMSA) {
-    if(edgeSeqs.size() == 0 ){
+    if (edgeSeqs.empty()) {
         return singleSequenceMSA(centerSeq);
     }
 
+    if (edgeSeqs.size() != alignmentResults.size()) {
+        Debug(Debug::ERROR) << "edgeSeqs.size (" << edgeSeqs.size() << ") is != alignmentResults.size (" << alignmentResults.size() << ")" << "\n";
+        EXIT(EXIT_FAILURE);
+    }
+
     char ** msaSequence = new char *[edgeSeqs.size() + 1];
     for(size_t i = 0; i <= edgeSeqs.size(); i++){
         // FIXME: in deletion case, the msa could become even larger than maxSeqLen
         msaSequence[i] = initX(noDeletionMSA ? centerSeq->L + 1: maxSeqLen + 1);
     }
 
-    if(edgeSeqs.size() != alignmentResults.size()){
-        Debug(Debug::ERROR) << "edgeSeqs.size (" << edgeSeqs.size() << ") is != alignmentResults.size (" << alignmentResults.size() << ")" << "\n";
-        EXIT(EXIT_FAILURE);
-    }
-	
-	
     computeQueryGaps(queryGaps, centerSeq, edgeSeqs, alignmentResults);
     // process gaps in Query (update sequences)
     // and write query Alignment at position 0
@@ -247,9 +246,9 @@ MultipleAlignment::MSAResult MultipleAlignment::computeMSA(Sequence *centerSeq,
     //alignmentResults.clear();
     // map to int
     for (size_t k = 0; k < edgeSeqs.size() + 1; ++k) {
-        for (unsigned int pos = 0; pos < centerSeqSize; ++pos) {
+        for (size_t pos = 0; pos < centerSeqSize; ++pos) {
             msaSequence[k][pos] = (msaSequence[k][pos] == '-') ?
-                                  GAP : subMat->aa2int[(int) msaSequence[k][pos]];
+                                  GAP : static_cast<int>(subMat->aa2num[static_cast<int>(msaSequence[k][pos])]);
         }
         int len = std::min(maxMsaSeqLen, (centerSeqSize + VECSIZE_INT*4));
         int startPos = std::min(centerSeqSize, maxMsaSeqLen - 1);
@@ -272,7 +271,7 @@ MultipleAlignment::MSAResult MultipleAlignment::singleSequenceMSA(Sequence *cent
             Debug(Debug::ERROR) << "queryMSASize (" << queryMSASize << ") is >= maxMsaSeqLen (" << maxMsaSeqLen << ")" << "\n";
             EXIT(EXIT_FAILURE);
         }
-        msaSequence[0][queryMSASize] = (char) centerSeq->int_sequence[queryPos];
+        msaSequence[0][queryMSASize] = (char) centerSeq->numSequence[queryPos];
         queryMSASize++;
     }
     return MSAResult(queryMSASize, centerSeq->L, 1, msaSequence);
diff --git a/src/alignment/MultipleAlignment.h b/src/alignment/MultipleAlignment.h
index 9c7d3ca..52667d0 100644
--- a/src/alignment/MultipleAlignment.h
+++ b/src/alignment/MultipleAlignment.h
@@ -38,15 +38,17 @@ class MultipleAlignment {
     MultipleAlignment(size_t maxSeqLen, size_t maxSetSize, SubstitutionMatrix *subMat, Matcher *aligner);
 
     ~MultipleAlignment();
+
     // Compute center star multiple alignment from sequence input
-    MultipleAlignment::MSAResult computeMSA(Sequence *centerSeq, const std::vector<Sequence *> &edgeSeqs, bool noDeletionMSA);
+    MSAResult computeMSA(Sequence *centerSeq, const std::vector<Sequence *> &edgeSeqs, bool noDeletionMSA);
+
+    MSAResult computeMSA(Sequence *centerSeq, const std::vector<Sequence *> &edgeSeqs, const std::vector<Matcher::result_t> &alignmentResults, bool noDeletionMSA);
+
     static void print(MSAResult msaResult, SubstitutionMatrix * subMat);
 
     // init aligned memory for the MSA
     static char *initX(int len);
 
-    MSAResult computeMSA(Sequence *pSequence, const std::vector<Sequence *> &vector, const std::vector<Matcher::result_t> &vector1, bool i);
-
     // clean memory for MSA
     static void deleteMSA(MultipleAlignment::MSAResult * res);
 	
@@ -60,15 +62,15 @@ class MultipleAlignment {
     size_t maxMsaSeqLen;
     unsigned int * queryGaps;
 
-    std::vector<Matcher::result_t> computeBacktrace(Sequence *center, const std::vector<Sequence *>& sequences);
+    std::vector<Matcher::result_t> computeBacktrace(Sequence *centerSeq, const std::vector<Sequence *> &sequences);
 
-    void computeQueryGaps(unsigned int *queryGaps, Sequence *center, const std::vector<Sequence *>& seqs, const std::vector<Matcher::result_t>& alignmentResults);
+    void computeQueryGaps(unsigned int *queryGaps, Sequence *centerSeq, const std::vector<Sequence *> &seqs, const std::vector<Matcher::result_t> &alignmentResults);
 
     size_t updateGapsInCenterSequence(char **msaSequence, Sequence *centerSeq, bool noDeletionMSA);
 
-    void updateGapsInSequenceSet(char **centerSeqSize, size_t seqs, const std::vector<Sequence *> &vector,
-                                                    const std::vector<Matcher::result_t> &queryGaps, unsigned int *noDeletionMSA,
-                                                    bool b);
+    void updateGapsInSequenceSet(char **msaSequence, size_t centerSeqSize, const std::vector<Sequence *> &seqs,
+                                 const std::vector<Matcher::result_t> &alignmentResults, unsigned int *queryGaps,
+                                 bool noDeletionMSA);
 
     MSAResult singleSequenceMSA(Sequence *centerSeq);
 	
diff --git a/src/alignment/PSSMCalculator.cpp b/src/alignment/PSSMCalculator.cpp
index 577ae73..7962c6e 100644
--- a/src/alignment/PSSMCalculator.cpp
+++ b/src/alignment/PSSMCalculator.cpp
@@ -66,11 +66,11 @@ PSSMCalculator::Profile PSSMCalculator::computePSSMFromMSA(size_t setSize,
         computeNeff_M(matchWeight, seqWeight, Neff_M, queryLength, setSize, msaSeqs);
     }
     // compute consensus sequence
-    std::string consensusSequence = computeConsensusSequence(matchWeight, queryLength, subMat->pBack, subMat->int2aa);
+    std::string consensusSequence = computeConsensusSequence(matchWeight, queryLength, subMat->pBack, subMat->num2aa);
     if(pca > 0.0){
         // add pseudocounts (compute the scalar product between matchWeight and substitution matrix with pseudo counts)
         preparePseudoCounts(matchWeight, pseudocountsWeight, Sequence::PROFILE_AA_SIZE, queryLength, (const float **) subMat->subMatrixPseudoCounts);
-        //    SubstitutionMatrix::print(subMat->subMatrixPseudoCounts, subMat->int2aa, 20 );
+        //    SubstitutionMatrix::print(subMat->subMatrixPseudoCounts, subMat->num2aa, 20 );
         computePseudoCounts(profile, matchWeight, pseudocountsWeight, Sequence::PROFILE_AA_SIZE, Neff_M, queryLength, pca, pcb);
     }else{
         for (size_t pos = 0; pos < queryLength; pos++) {
@@ -87,16 +87,16 @@ PSSMCalculator::Profile PSSMCalculator::computePSSMFromMSA(size_t setSize,
     return Profile(pssm, profile, Neff_M, consensusSequence);
 }
 
-void PSSMCalculator::printProfile(size_t queryLength){
-    printf("Pos ");
-    for(size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
-        printf("%2c    ", subMat->int2aa[aa]);
+void PSSMCalculator::printProfile(size_t queryLength) {
+    printf("Pos");
+    for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
+        printf(" %6c", subMat->num2aa[aa]);
     }
     printf("\n");
-    for(size_t i = 0; i < queryLength; i++){
-        printf("%3zu ", i);
-        for(size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
-            printf("%03.4f ", profile[i * Sequence::PROFILE_AA_SIZE + aa] );
+    for (size_t i = 0; i < queryLength; i++) {
+        printf("%3zu", i);
+        for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
+            printf(" %.4f", profile[i * Sequence::PROFILE_AA_SIZE + aa]);
         }
         printf("\n");
     }
@@ -105,7 +105,7 @@ void PSSMCalculator::printProfile(size_t queryLength){
 void PSSMCalculator::printPSSM(size_t queryLength){
     printf("Pos ");
     for(size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
-        printf("%3c ", subMat->int2aa[aa]);
+        printf("%3c ", subMat->num2aa[aa]);
     }
     printf("\n");
     for(size_t i = 0; i <  queryLength; i++) {
@@ -218,9 +218,7 @@ void PSSMCalculator::computeSequenceWeights(float *seqWeight, size_t queryLength
         // "Position-based Sequence Weights", Henikoff (1994)
         for (size_t k = 0; k < setSize; ++k) {
             if (msaSeqs[k][pos] != MultipleAlignment::GAP) {
-                if(distinct_aa_count == 0){
-                    seqWeight[k] += 0.0;
-                } else {
+                if (distinct_aa_count != 0) {
                     const unsigned int aa_pos = msaSeqs[k][pos];
 //                    std::cout << "k="<< k << "\t";
                     if(aa_pos < Sequence::PROFILE_AA_SIZE){ // Treat score of X with other amino acid as 0.0
@@ -308,20 +306,20 @@ void PSSMCalculator::computeContextSpecificWeights(float * matchWeight, float *w
     // Main loop through alignment columns
     for (size_t i = 0; i < queryLength; i++)  // Calculate wi[k] at position i as well as Neff[i]
     {
-        bool change = 0;
+        bool change = false;
         // Check all sequences k and update n[j][a] and ri[j] if necessary
         for (size_t k = 0; k < setSize; ++k) {
             // Update amino acid and GAP / ENDGAP counts for sequences with AA in i-1 and GAP/ENDGAP in i or vice versa
 //            printf("%d %d %d\n", k, i, (int) X[k][i - 1]);
             if ((i == 0  && X[k][i] < MultipleAlignment::ANY) ||
                 (i != 0  && X[k][i - 1] >= MultipleAlignment::ANY && X[k][i] < MultipleAlignment::ANY)) {  // ... if sequence k was NOT included in i-1 and has to be included for column i
-                change = 1;
+                change = true;
                 nseqi++;
                 for (size_t j = 0; j < queryLength; ++j){
                     n[j][(int) X[k][j]]++;
                 }
             } else if ( i != 0 && X[k][i - 1] < MultipleAlignment::ANY && X[k][i] >= MultipleAlignment::ANY) {  // ... if sequence k WAS included in i-1 and has to be thrown out for column i
-                change = 1;
+                change = true;
                 nseqi--;
                 for (size_t j = 0; j < queryLength; ++j)
                     n[j][(int) X[k][j]]--;
@@ -460,7 +458,7 @@ void PSSMCalculator::computeContextSpecificWeights(float * matchWeight, float *w
     free(f);
 }
 
-std::string PSSMCalculator::computeConsensusSequence(float *frequency, size_t queryLength, double *pBack, char *int2aa) {
+std::string PSSMCalculator::computeConsensusSequence(float *frequency, size_t queryLength, double *pBack, char *num2aa) {
     std::string consens;
     for (size_t pos = 0; pos < queryLength; pos++) {
         float maxw = 1E-8;
@@ -472,7 +470,7 @@ std::string PSSMCalculator::computeConsensusSequence(float *frequency, size_t qu
                 maxa = aa;
             }
         }
-        consens.push_back(int2aa[maxa]);
+        consens.push_back(num2aa[maxa]);
     }
     return consens;
 }
diff --git a/src/alignment/PSSMCalculator.h b/src/alignment/PSSMCalculator.h
index 5808f1d..cc3f1b1 100644
--- a/src/alignment/PSSMCalculator.h
+++ b/src/alignment/PSSMCalculator.h
@@ -89,7 +89,7 @@ class PSSMCalculator {
     float pca;
     float pcb;
 
-    std::string computeConsensusSequence(float *pDouble, size_t queryLength, double *back, char *int2aa);
+    std::string computeConsensusSequence(float *pDouble, size_t queryLength, double *back, char *num2aa);
 };
 
 
diff --git a/src/alignment/StripedSmithWaterman.cpp b/src/alignment/StripedSmithWaterman.cpp
index f231ec0..9cb3b24 100644
--- a/src/alignment/StripedSmithWaterman.cpp
+++ b/src/alignment/StripedSmithWaterman.cpp
@@ -23,13 +23,14 @@
    Written by Michael Farrar, 2006 (alignment), Mengyao Zhao (SSW Library) and Martin Steinegger (change structure add aa composition, profile and AVX2 support).
    Please send bug reports and/or suggestions to martin.steinegger@mpibpc.mpg.de.
 */
-#include <Parameters.h>
+#include "Parameters.h"
 #include "StripedSmithWaterman.h"
 
 #include "Util.h"
 #include "SubstitutionMatrix.h"
 #include "Debug.h"
 
+#include <iostream>
 
 SmithWaterman::SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCorrection) {
 	maxSequenceLength += 1;
@@ -127,7 +128,7 @@ void SmithWaterman::createQueryProfile(simd_int *profile, const int8_t *query_se
 
 
 s_align SmithWaterman::ssw_align (
-		const int *db_sequence,
+		const unsigned char *db_sequence,
 		int32_t db_length,
 		const uint8_t gap_open,
 		const uint8_t gap_extend,
@@ -181,28 +182,26 @@ s_align SmithWaterman::ssw_align (
 		r.ref_end2 = -1;
 	}
 
-    // need to be defined before goto end
-    int32_t queryOffset;
-    bool hasLowerEvalue;
-    bool hasLowerCoverage;
+    const bool isProfile = Parameters::isEqualDbtype(profile->sequence_type, Parameters::DBTYPE_HMM_PROFILE)
+                         || Parameters::isEqualDbtype(profile->sequence_type, Parameters::DBTYPE_PROFILE_STATE_PROFILE);
     // no residue could be aligned
     if (r.dbEndPos1 == -1) {
-        goto end;
+        return r;
     }
-	queryOffset = query_length - r.qEndPos1;
+    int32_t queryOffset = query_length - r.qEndPos1;
 	r.evalue = evaluer->computeEvalue(r.score1, query_length);
-	hasLowerEvalue = r.evalue > evalueThr;
+    bool hasLowerEvalue = r.evalue > evalueThr;
 	r.qCov = computeCov(0, r.qEndPos1, query_length);
 	r.tCov = computeCov(0, r.dbEndPos1, db_length);
-    hasLowerCoverage = !(Util::hasCoverage(covThr, covMode, r.qCov, r.tCov));
+    bool hasLowerCoverage = !(Util::hasCoverage(covThr, covMode, r.qCov, r.tCov));
 
 	if (alignmentMode == 0 || ((alignmentMode == 2 || alignmentMode == 1) && (hasLowerEvalue || hasLowerCoverage))) {
-		goto end;
+        return r;
 	}
 
 	// Find the beginning position of the best alignment.
 	if (word == 0) {
-		if(Parameters::isEqualDbtype(profile->sequence_type, Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(profile->sequence_type, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
+		if (isProfile) {
 			createQueryProfile<int8_t, VECSIZE_INT * 4, PROFILE>(profile->profile_rev_byte, profile->query_rev_sequence, NULL, profile->mat_rev,
 																 r.qEndPos1 + 1, profile->alphabetSize, profile->bias, queryOffset, profile->query_length);
 		} else {
@@ -212,7 +211,7 @@ s_align SmithWaterman::ssw_align (
 		bests_reverse = sw_sse2_byte(db_sequence, 1, r.dbEndPos1 + 1, r.qEndPos1 + 1, gap_open, gap_extend, profile->profile_rev_byte,
 									 r.score1, profile->bias, maskLen);
 	} else {
-		if(Parameters::isEqualDbtype(profile->sequence_type, Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(profile->sequence_type, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
+		if (isProfile) {
 			createQueryProfile<int16_t, VECSIZE_INT * 2, PROFILE>(profile->profile_rev_word, profile->query_rev_sequence, NULL, profile->mat_rev,
 																  r.qEndPos1 + 1, profile->alphabetSize, 0, queryOffset, profile->query_length);
 
@@ -239,15 +238,17 @@ s_align SmithWaterman::ssw_align (
 	r.qCov = computeCov(r.qStartPos1, r.qEndPos1, query_length);
 	r.tCov = computeCov(r.dbStartPos1, r.dbEndPos1, db_length);
 	hasLowerCoverage = !(Util::hasCoverage(covThr, covMode, r.qCov, r.tCov));
-	if (alignmentMode == 1 || hasLowerCoverage) // just start and end point are needed
-		goto end;
+    // only start and end point are needed
+    if (alignmentMode == 1 || hasLowerCoverage) {
+        return r;
+    }
 
 	// Generate cigar.
 	db_length = r.dbEndPos1 - r.dbStartPos1 + 1;
 	query_length = r.qEndPos1 - r.qStartPos1 + 1;
 	band_width = abs(db_length - query_length) + 1;
 
-	if(Parameters::isEqualDbtype(profile->sequence_type, Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(profile->sequence_type, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
+	if (isProfile) {
 		path = banded_sw<PROFILE>(db_sequence + r.dbStartPos1, profile->query_sequence + r.qStartPos1,
 								  NULL, db_length, query_length,
 								  r.qStartPos1, r.score1, gap_open, gap_extend, band_width,
@@ -265,8 +266,6 @@ s_align SmithWaterman::ssw_align (
 		r.cigarLen = path->length;
 	}	delete path;
 
-
-	end:
 	return r;
 }
 
@@ -299,7 +298,7 @@ uint32_t SmithWaterman::cigar_int_to_len (uint32_t cigar_int)
 	return res;
 }
 
-std::pair<SmithWaterman::alignment_end, SmithWaterman::alignment_end> SmithWaterman::sw_sse2_byte (const int* db_sequence,
+std::pair<SmithWaterman::alignment_end, SmithWaterman::alignment_end> SmithWaterman::sw_sse2_byte (const unsigned char* db_sequence,
 														   int8_t ref_dir,	// 0: forward ref; 1: reverse ref
 														   int32_t db_length,
 														   int32_t query_length,
@@ -527,10 +526,10 @@ std::pair<SmithWaterman::alignment_end, SmithWaterman::alignment_end> SmithWater
 }
 
 
-std::pair<SmithWaterman::alignment_end, SmithWaterman::alignment_end> SmithWaterman::sw_sse2_word (const int* db_sequence,
+std::pair<SmithWaterman::alignment_end, SmithWaterman::alignment_end> SmithWaterman::sw_sse2_word (const unsigned char* db_sequence,
 														   int8_t ref_dir,	// 0: forward ref; 1: reverse ref
 														   int32_t db_length,
-														   int32_t query_lenght,
+														   int32_t query_length,
 														   const uint8_t gap_open, /* will be used as - */
 														   const uint8_t gap_extend, /* will be used as - */
 														   const simd_int*query_profile_word,
@@ -540,10 +539,10 @@ std::pair<SmithWaterman::alignment_end, SmithWaterman::alignment_end> SmithWater
 #define max8(m, vm) ((m) = simdi16_hmax((vm)));
 
 	uint16_t max = 0;		                     /* the max alignment score */
-	int32_t end_read = query_lenght - 1;
+	int32_t end_read = query_length - 1;
 	int32_t end_ref = 0; /* 1_based best alignment ending point; Initialized as isn't aligned - 0. */
 	const unsigned int SIMD_SIZE = VECSIZE_INT * 2;
-	int32_t segLen = (query_lenght + SIMD_SIZE-1) / SIMD_SIZE; /* number of segment */
+	int32_t segLen = (query_length + SIMD_SIZE-1) / SIMD_SIZE; /* number of segment */
 	/* array to record the alignment read ending position of the largest score of each reference position */
 	memset(this->maxColumn, 0, db_length * sizeof(uint16_t));
 	uint16_t * maxColumn = (uint16_t *) this->maxColumn;
@@ -702,25 +701,24 @@ std::pair<SmithWaterman::alignment_end, SmithWaterman::alignment_end> SmithWater
 #undef max8
 }
 
-void SmithWaterman::ssw_init (const Sequence* q,
-							  const int8_t* mat,
-							  const BaseMatrix *m,
-							  const int32_t alphabetSize,
-							  const int8_t score_size) {
+void SmithWaterman::ssw_init(const Sequence* q,
+							 const int8_t* mat,
+							 const BaseMatrix *m,
+							 const int8_t score_size) {
 
 	profile->bias = 0;
 	profile->sequence_type = q->getSequenceType();
+    const int32_t alphabetSize = m->alphabetSize;
 	int32_t compositionBias = 0;
-	bool isProfile = Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_PROFILE_STATE_PROFILE);
-	if(isProfile == false && aaBiasCorrection == true) {
-		SubstitutionMatrix::calcLocalAaBiasCorrection(m, q->int_sequence, q->L, tmp_composition_bias);
+	bool isProfile = Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_HMM_PROFILE)
+	               || Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_PROFILE_STATE_PROFILE);
+	if (!isProfile && aaBiasCorrection) {
+		SubstitutionMatrix::calcLocalAaBiasCorrection(m, q->numSequence, q->L, tmp_composition_bias);
 		for (int i =0; i < q->L; i++) {
 			profile->composition_bias[i] = (int8_t) (tmp_composition_bias[i] < 0.0)? tmp_composition_bias[i] - 0.5: tmp_composition_bias[i] + 0.5;
-			compositionBias = (static_cast<int8_t>(compositionBias) < profile->composition_bias[i])
-							  ? compositionBias  :  profile->composition_bias[i];
+			compositionBias = (compositionBias < profile->composition_bias[i]) ? compositionBias : profile->composition_bias[i];
 		}
-		compositionBias = std::min(compositionBias, static_cast<int32_t>(0));
-//		std::cout << compositionBias << std::endl;
+		compositionBias = std::min(compositionBias, 0);
 	} else {
 		memset(profile->composition_bias, 0, q->L* sizeof(int8_t));
 	}
@@ -734,9 +732,7 @@ void SmithWaterman::ssw_init (const Sequence* q,
 	} else {
 		memcpy(profile->mat, mat, alphabetSize * alphabetSize * sizeof(int8_t));
 	}
-	for (int i = 0; i < q->L; i++) {
-		profile->query_sequence[i] = (int8_t) q->int_sequence[i];
-	}
+	memcpy(profile->query_sequence, q->numSequence, q->L);
 	if (score_size == 0 || score_size == 2) {
 		/* Find the bias to use in the substitution matrix */
 		int32_t bias = 0;
@@ -754,14 +750,14 @@ void SmithWaterman::ssw_init (const Sequence* q,
 		}
 		bias = abs(bias) + abs(compositionBias);
 		profile->bias = bias;
-		if(Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_PROFILE_STATE_PROFILE)){
+		if (isProfile) {
 			createQueryProfile<int8_t, VECSIZE_INT * 4, PROFILE>(profile->profile_byte, profile->query_sequence, NULL, profile->mat, q->L, alphabetSize, bias, 1, q->L);
 		} else {
 			createQueryProfile<int8_t, VECSIZE_INT * 4, SUBSTITUTIONMATRIX>(profile->profile_byte, profile->query_sequence, profile->composition_bias, profile->mat, q->L, alphabetSize, bias, 0, 0);
 		}
 	}
 	if (score_size == 1 || score_size == 2) {
-		if(Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_HMM_PROFILE) || Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_PROFILE_STATE_PROFILE)){
+		if (isProfile) {
 			createQueryProfile<int16_t, VECSIZE_INT * 2, PROFILE>(profile->profile_word, profile->query_sequence, NULL, profile->mat, q->L, alphabetSize, 0, 1, q->L);
 			for (int32_t i = 0; i< alphabetSize; i++) {
 				profile->profile_word_linear[i] = &profile_word_linear_data[i*q->L];
@@ -775,7 +771,7 @@ void SmithWaterman::ssw_init (const Sequence* q,
 			for(int32_t i = 0; i< alphabetSize; i++) {
 				profile->profile_word_linear[i] = &profile_word_linear_data[i*q->L];
 				for (int j = 0; j < q->L; j++) {
-					profile->profile_word_linear[i][j] = mat[i * alphabetSize + q->int_sequence[j]] + profile->composition_bias[j];
+					profile->profile_word_linear[i][j] = mat[i * alphabetSize + q->numSequence[j]] + profile->composition_bias[j];
 				}
 			}
 		}
@@ -786,8 +782,7 @@ void SmithWaterman::ssw_init (const Sequence* q,
 	seq_reverse( profile->query_rev_sequence, profile->query_sequence, q->L);
 	seq_reverse( profile->composition_bias_rev, profile->composition_bias, q->L);
 
-	if(Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_HMM_PROFILE) ||
-	   Parameters::isEqualDbtype(q->getSequenceType(), Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
+	if (isProfile) {
 		for (int32_t i = 0; i < alphabetSize; i++) {
 			const int8_t *startToRead = profile->mat + (i * q->L);
 			int8_t *startToWrite      = profile->mat_rev + (i * q->L);
@@ -798,7 +793,7 @@ void SmithWaterman::ssw_init (const Sequence* q,
 	profile->alphabetSize = alphabetSize;
 }
 template <const unsigned int type>
-SmithWaterman::cigar * SmithWaterman::banded_sw(const int *db_sequence, const int8_t *query_sequence, const int8_t * compositionBias,
+SmithWaterman::cigar * SmithWaterman::banded_sw(const unsigned char *db_sequence, const int8_t *query_sequence, const int8_t * compositionBias,
 												int32_t db_length, int32_t query_length, int32_t queryStart,
 												int32_t score, const uint32_t gap_open,
 												const uint32_t gap_extend, int32_t band_width, const int8_t *mat, int32_t n) {
@@ -1076,7 +1071,7 @@ float SmithWaterman::computeCov(unsigned int startPos, unsigned int endPos, unsi
 	return (std::min(len, endPos) - startPos + 1) / (float) len;
 }
 
-s_align SmithWaterman::scoreIdentical(int *dbSeq, int L, EvalueComputation * evaluer, int alignmentMode) {
+s_align SmithWaterman::scoreIdentical(unsigned char *dbSeq, int L, EvalueComputation * evaluer, int alignmentMode) {
 	if(profile->query_length != L){
 		std::cerr << "scoreIdentical has different length L: "
 				  << L << " query_length: " << profile->query_length
@@ -1112,7 +1107,7 @@ s_align SmithWaterman::scoreIdentical(int *dbSeq, int L, EvalueComputation * eva
 	return r;
 }
 
-int SmithWaterman::ungapped_alignment(const int *db_sequence, int32_t db_length) {
+int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t db_length) {
 #define SWAP(tmp, arg1, arg2) tmp = arg1; arg1 = arg2; arg2 = tmp;
 
 	int i; // position in query bands (0,..,W-1)
diff --git a/src/alignment/StripedSmithWaterman.h b/src/alignment/StripedSmithWaterman.h
index 79b3984..d850e18 100644
--- a/src/alignment/StripedSmithWaterman.h
+++ b/src/alignment/StripedSmithWaterman.h
@@ -35,7 +35,6 @@
 
 #include <cstdio>
 #include <cstdlib>
-#include <iostream>
 
 #if !defined(__APPLE__) && !defined(__llvm__)
 #include <malloc.h>
@@ -131,7 +130,7 @@ class SmithWaterman{
      while bit 8 is not, the function will return cigar only when both criteria are fulfilled. All returned positions are
      0-based coordinate.
      */
-    s_align  ssw_align (const int*db_sequence,
+    s_align  ssw_align (const unsigned char*db_sequence,
                         int32_t db_length,
                         const uint8_t gap_open,
                         const uint8_t gap_extend,
@@ -150,7 +149,7 @@ class SmithWaterman{
    @param	db_length	length of the target sequence
    @return	max diagonal score
    */
-   int ungapped_alignment(const int *db_sequence,
+   int ungapped_alignment(const unsigned char *db_sequence,
                           int32_t db_length);
 
   /*!	@function	Create the query profile using the query sequence.
@@ -171,8 +170,7 @@ class SmithWaterman{
    -2 -2 -2  2 //T
    mat is the pointer to the array {2, -2, -2, -2, -2, 2, -2, -2, -2, -2, 2, -2, -2, -2, -2, 2}
    */
-    void ssw_init(const Sequence *q, const int8_t *mat, const BaseMatrix *m, const int32_t alphabetSize,
-                  const int8_t score_size);
+    void ssw_init(const Sequence *q, const int8_t *mat, const BaseMatrix *m, const int8_t score_size);
 
 
     static char cigar_int_to_op (uint32_t cigar_int);
@@ -182,7 +180,7 @@ class SmithWaterman{
 
     static float computeCov(unsigned int startPos, unsigned int endPos, unsigned int len);
 
-    s_align scoreIdentical(int *dbSeq, int L, EvalueComputation * evaluer, int alignmentMode);
+    s_align scoreIdentical(unsigned char *dbSeq, int L, EvalueComputation * evaluer, int alignmentMode);
 
     static void seq_reverse(int8_t * reverse, const int8_t* seq, int32_t end)	/* end is 0-based alignment ending position */
     {
@@ -209,7 +207,7 @@ class SmithWaterman{
         int8_t* composition_bias_rev;
         int8_t* mat;
         // Memory layout of if mat + queryProfile is qL * AA
-        //    Query lenght
+        //    Query length
         // A  -1  -3  -2  -1  -4  -2  -2  -3  -1  -3  -2  -2   7  -1  -2  -1  -1  -2  -5  -3
         // C  -1  -4   2   5  -3  -2   0  -3   1  -3  -2   0  -1   2   0   0  -1  -3  -4  -2
         // ...
@@ -252,7 +250,7 @@ class SmithWaterman{
      wight_match > 0, all other weights < 0.
      The returned positions are 0-based.
      */
-    std::pair<alignment_end, alignment_end> sw_sse2_byte (const int*db_sequence,
+    std::pair<alignment_end, alignment_end> sw_sse2_byte (const unsigned char*db_sequence,
                                  int8_t ref_dir,	// 0: forward ref; 1: reverse ref
                                  int32_t db_length,
                                  int32_t query_length,
@@ -266,10 +264,10 @@ class SmithWaterman{
                                  uint8_t bias,  /* Shift 0 point to a positive value. */
                                  int32_t maskLen);
 
-    std::pair<alignment_end, alignment_end> sw_sse2_word (const int* db_sequence,
+    std::pair<alignment_end, alignment_end> sw_sse2_word (const unsigned char* db_sequence,
                                  int8_t ref_dir,	// 0: forward ref; 1: reverse ref
                                  int32_t db_length,
-                                 int32_t query_lenght,
+                                 int32_t query_length,
                                  const uint8_t gap_open, /* will be used as - */
                                  const uint8_t gap_extend, /* will be used as - */
                                  const simd_int*query_profile_byte,
@@ -277,7 +275,7 @@ class SmithWaterman{
                                  int32_t maskLen);
 
     template <const unsigned int type>
-    SmithWaterman::cigar *banded_sw(const int *db_sequence, const int8_t *query_sequence, const int8_t * compositionBias, int32_t db_length, int32_t query_length, int32_t queryStart, int32_t score, const uint32_t gap_open, const uint32_t gap_extend, int32_t band_width, const int8_t *mat, int32_t n);
+    SmithWaterman::cigar *banded_sw(const unsigned char *db_sequence, const int8_t *query_sequence, const int8_t * compositionBias, int32_t db_length, int32_t query_length, int32_t queryStart, int32_t score, const uint32_t gap_open, const uint32_t gap_extend, int32_t band_width, const int8_t *mat, int32_t n);
 
     /*!	@function		Produce CIGAR 32-bit unsigned integer from CIGAR operation and CIGAR length
      @param	length		length of CIGAR
diff --git a/src/alignment/rescorediagonal.cpp b/src/alignment/rescorediagonal.cpp
index de9f326..26b19e1 100644
--- a/src/alignment/rescorediagonal.cpp
+++ b/src/alignment/rescorediagonal.cpp
@@ -135,7 +135,7 @@ int doRescorediagonal(Parameters &par,
             char *queryRevSeq = NULL;
             int queryRevSeqLen = par.maxSeqLen + 1;
             if (reversePrefilterResult == true) {
-                queryRevSeq = new char[queryRevSeqLen];
+                queryRevSeq = static_cast<char*>(malloc(queryRevSeqLen));
             }
 #pragma omp for schedule(dynamic, 1)
             for (size_t id = start; id < (start + bucketSize); id++) {
@@ -161,16 +161,15 @@ int doRescorediagonal(Parameters &par,
                         queryLen = origQueryLen*2;
                     }
 
-                    if(queryLen > queryRevSeqLen){
-                        delete [] queryRevSeq;
-                        queryRevSeq = new char[queryLen];
-                        queryRevSeqLen = queryLen;
+                    if(reversePrefilterResult == true && queryLen > queryRevSeqLen){
+                        queryRevSeq = static_cast<char*>(realloc(queryRevSeq, queryLen+1));
+                        queryRevSeqLen = queryLen+1;
                     }
                     if (reversePrefilterResult == true) {
                         NucleotideMatrix *nuclMatrix = (NucleotideMatrix *) subMat;
                         for (int pos = queryLen - 1; pos > -1; pos--) {
-                            int res = subMat->aa2int[static_cast<int>(querySeq[pos])];
-                            queryRevSeq[(queryLen - 1) - pos] = subMat->int2aa[nuclMatrix->reverseResidue(res)];
+                            unsigned char res = subMat->aa2num[static_cast<int>(querySeq[pos])];
+                            queryRevSeq[(queryLen - 1) - pos] = subMat->num2aa[nuclMatrix->reverseResidue(res)];
                         }
                     }
                     if (sameQTDB && qdbr->isCompressed()) {
@@ -315,13 +314,14 @@ int doRescorediagonal(Parameters &par,
                         } else if (par.rescoreMode == Parameters::RESCORE_MODE_SUBSTITUTION) {
                             hit_t hit;
                             hit.seqId = results[entryIdx].seqId;
-                            hit.prefScore = bitScore;
+                            hit.prefScore = (isReverse) ? -bitScore : bitScore;
                             hit.diagonal = diagonal;
                             shortResults.emplace_back(hit);
                         } else {
                             hit_t hit;
                             hit.seqId = results[entryIdx].seqId;
                             hit.prefScore = 100 * seqId;
+                            hit.prefScore = (isReverse) ? -hit.prefScore : hit.prefScore;
                             hit.diagonal = diagonal;
                             shortResults.emplace_back(hit);
                         }
@@ -340,8 +340,7 @@ int doRescorediagonal(Parameters &par,
                     std::sort(shortResults.begin(), shortResults.end(), hit_t::compareHitsByScoreAndId);
                 }
                 for (size_t i = 0; i < shortResults.size(); ++i) {
-                    size_t len = snprintf(buffer, 100, "%u\t%d\t%d\n", shortResults[i].seqId, shortResults[i].prefScore,
-                                          shortResults[i].diagonal);
+                    size_t len = QueryMatcher::prefilterHitToBuffer(buffer, shortResults[i]);
                     resultBuffer.append(buffer, len);
                 }
 
@@ -351,7 +350,7 @@ int doRescorediagonal(Parameters &par,
                 alnResults.clear();
             }
             if (reversePrefilterResult == true) {
-                delete [] queryRevSeq;
+                free(queryRevSeq);
             }
         }
         resultReader.remapData();
@@ -387,7 +386,7 @@ int rescorediagonal(int argc, const char **argv, const Command &command) {
 
     DBReader<unsigned int> resultReader(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
     resultReader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
-    int dbtype = Parameters::DBTYPE_PREFILTER_RES;
+    int dbtype = resultReader.getDbtype(); // this is DBTYPE_PREFILTER_RES || DBTYPE_PREFILTER_REV_RES
     if(par.rescoreMode == Parameters::RESCORE_MODE_ALIGNMENT ||
        par.rescoreMode == Parameters::RESCORE_MODE_GLOBAL_ALIGNMENT ||
        par.rescoreMode == Parameters::RESCORE_MODE_WINDOW_QUALITY_ALIGNMENT){
diff --git a/src/clustering/AlignmentSymmetry.cpp b/src/clustering/AlignmentSymmetry.cpp
index 85bfd8f..24e72ea 100644
--- a/src/clustering/AlignmentSymmetry.cpp
+++ b/src/clustering/AlignmentSymmetry.cpp
@@ -20,6 +20,7 @@
 void AlignmentSymmetry::readInData(DBReader<unsigned int>*alnDbr, DBReader<unsigned int>*seqDbr,
                                    unsigned int **elementLookupTable, unsigned short **elementScoreTable,
                                    int scoretype, size_t *offsets) {
+    const int alnType = alnDbr->getDbtype();
     const size_t dbSize = seqDbr->getSize();
     const size_t flushSize = 1000000;
     size_t iterations = static_cast<int>(ceil(static_cast<double>(dbSize)/static_cast<double>(flushSize)));
@@ -42,9 +43,22 @@ void AlignmentSymmetry::readInData(DBReader<unsigned int>*alnDbr, DBReader<unsig
                 char *data = alnDbr->getDataByDBKey(clusterId, thread_idx);
 
                 if (*data == '\0') { // check if file contains entry
-                    Debug(Debug::ERROR) << "Sequence " << i
-                                        << " does not contain any sequence for key " << clusterId
-                                        << "!\n";
+                    elementLookupTable[i][0] = seqDbr->getId(clusterId);
+                    if (elementScoreTable != NULL) {
+                        if (Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_ALIGNMENT_RES)) {
+                            if (scoretype == Parameters::APC_ALIGNMENTSCORE) {
+                                //column 1 = alignment score
+                                elementScoreTable[i][0] = (unsigned short) (USHRT_MAX);
+                            } else {
+                                //column 2 = sequence identity [0-1]
+                                elementScoreTable[i][0] = (unsigned short) (1.0 * 1000.0f);
+                            }
+                        } else if (Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_PREFILTER_RES) ||
+                                   Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_PREFILTER_REV_RES)) {
+                            //column 1 = alignment score or sequence identity [0-100]
+                            elementScoreTable[i][0] = (unsigned short) (USHRT_MAX);
+                        }
+                    }
                     continue;
                 }
                 size_t setSize = LEN(offsets, i);
@@ -62,15 +76,29 @@ void AlignmentSymmetry::readInData(DBReader<unsigned int>*alnDbr, DBReader<unsig
                     const unsigned int key = (unsigned int) strtoul(dbKey, NULL, 10);
                     const size_t currElement = seqDbr->getId(key);
                     if (elementScoreTable != NULL) {
-                        if (scoretype == Parameters::APC_ALIGNMENTSCORE) {
-                            //column 1 = alignment score
+                        if (Parameters::isEqualDbtype(alnType,Parameters::DBTYPE_ALIGNMENT_RES)) {
+                            if (scoretype == Parameters::APC_ALIGNMENTSCORE) {
+                                //column 1 = alignment score
+                                Util::parseByColumnNumber(data, similarity, 1);
+                                elementScoreTable[i][writePos] = (unsigned short) (atof(similarity));
+                            } else {
+                                //column 2 = sequence identity [0-1]
+                                Util::parseByColumnNumber(data, similarity, 2);
+                                elementScoreTable[i][writePos] = (unsigned short) (atof(similarity) * 1000.0f);
+                            }
+                        }
+                        else if (Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_PREFILTER_RES) ||
+                                 Parameters::isEqualDbtype(alnType, Parameters::DBTYPE_PREFILTER_REV_RES)) {
+                            //column 1 = alignment score or sequence identity [0-100]
                             Util::parseByColumnNumber(data, similarity, 1);
-                            elementScoreTable[i][writePos] = (unsigned short) (atof(similarity));
-                        } else {
-                            //column 2 = sequence identity
-                            Util::parseByColumnNumber(data, similarity, 2);
-                            elementScoreTable[i][writePos] = (unsigned short) (atof(similarity) * 1000.0f);
+                            short sim = atoi(similarity);
+                            elementScoreTable[i][writePos] = (unsigned short) (sim >0 ? sim : -sim);
+                        }
+                        else {
+                            Debug(Debug::ERROR) << "Alignment format is not supported!\n";
+                            EXIT(EXIT_FAILURE);
                         }
+
                     }
                     if (currElement == UINT_MAX || currElement > seqDbr->getSize()) {
                         Debug(Debug::ERROR) << "Element " << dbKey
diff --git a/src/clustering/Clustering.cpp b/src/clustering/Clustering.cpp
index 50bf727..2eef238 100644
--- a/src/clustering/Clustering.cpp
+++ b/src/clustering/Clustering.cpp
@@ -41,7 +41,7 @@ void Clustering::run(int mode) {
 
     if (mode == Parameters::GREEDY) {
         Debug(Debug::INFO) << "Clustering mode: Greedy\n";
-        ret = algorithm->execute(2);
+        ret = algorithm->execute(4);
     } else if (mode == Parameters::GREEDY_MEM) {
         Debug(Debug::INFO) << "Clustering mode: Greedy Low Mem\n";
         ret = algorithm->execute(4);
diff --git a/src/clustering/Clustering.h b/src/clustering/Clustering.h
index 1822694..4f0ff8a 100644
--- a/src/clustering/Clustering.h
+++ b/src/clustering/Clustering.h
@@ -1,8 +1,6 @@
 #ifndef CLUSTERING_H
 #define CLUSTERING_H
 
-#include <list>
-#include <string>
 #include <unordered_map>
 
 #include "DBReader.h"
diff --git a/src/clustering/ClusteringAlgorithms.cpp b/src/clustering/ClusteringAlgorithms.cpp
index 4411699..b398ccb 100644
--- a/src/clustering/ClusteringAlgorithms.cpp
+++ b/src/clustering/ClusteringAlgorithms.cpp
@@ -42,7 +42,7 @@ std::unordered_map<unsigned int, std::vector<unsigned int>>  ClusteringAlgorithm
     std::fill_n(assignedcluster, dbSize, UINT_MAX);
 
     //time
-    if (mode==4) {
+    if (mode==4 || mode==2) {
         greedyIncrementalLowMem(assignedcluster);
     }else {
         size_t elementCount = 0;
@@ -56,7 +56,7 @@ std::unordered_map<unsigned int, std::vector<unsigned int>>  ClusteringAlgorithm
             for (size_t i = 0; i < alnDbr->getSize(); i++) {
                 const char *data = alnDbr->getData(i, thread_idx);
                 const size_t dataSize = alnDbr->getEntryLen(i);
-                elementCount += Util::countLines(data, dataSize);
+                elementCount += (*data == '\0') ? 1 : Util::countLines(data, dataSize);
             }
         }
         unsigned int * elements = new(std::nothrow) unsigned int[elementCount];
@@ -74,51 +74,45 @@ std::unordered_map<unsigned int, std::vector<unsigned int>>  ClusteringAlgorithm
         std::fill_n(bestscore, dbSize, SHRT_MIN);
 
         readInClusterData(elementLookupTable, elements, scoreLookupTable, score, elementOffsets, elementCount);
-
-
-        if (mode==2){
-            greedyIncremental(elementLookupTable, elementOffsets,
-                              dbSize, assignedcluster);
-        }else {
-            ClusteringAlgorithms::initClustersizes();
-            if (mode == 1) {
-                setCover(elementLookupTable, scoreLookupTable, assignedcluster, bestscore, elementOffsets);
-            } else if (mode == 3) {
-                Debug(Debug::INFO) << "connected component mode" << "\n";
-                for (int cl_size = dbSize - 1; cl_size >= 0; cl_size--) {
-                    unsigned int representative = sorted_clustersizes[cl_size];
-                    if (assignedcluster[representative] == UINT_MAX) {
-                        assignedcluster[representative] = representative;
-                        std::queue<int> myqueue;
-                        myqueue.push(representative);
-                        std::queue<int> iterationcutoffs;
-                        iterationcutoffs.push(0);
-                        //delete clusters of members;
-                        while (!myqueue.empty()) {
-                            int currentid = myqueue.front();
-                            int iterationcutoff = iterationcutoffs.front();
-                            assignedcluster[currentid] = representative;
-                            myqueue.pop();
-                            iterationcutoffs.pop();
-                            size_t elementSize = (elementOffsets[currentid + 1] - elementOffsets[currentid]);
-                            for (size_t elementId = 0; elementId < elementSize; elementId++) {
-                                unsigned int elementtodelete = elementLookupTable[currentid][elementId];
-                                if (assignedcluster[elementtodelete] == UINT_MAX && iterationcutoff < maxiterations) {
-                                    myqueue.push(elementtodelete);
-                                    iterationcutoffs.push((iterationcutoff + 1));
-                                }
-                                assignedcluster[elementtodelete] = representative;
+        ClusteringAlgorithms::initClustersizes();
+        if (mode == 1) {
+            setCover(elementLookupTable, scoreLookupTable, assignedcluster, bestscore, elementOffsets);
+        } else if (mode == 3) {
+            Debug(Debug::INFO) << "connected component mode" << "\n";
+            for (int cl_size = dbSize - 1; cl_size >= 0; cl_size--) {
+                unsigned int representative = sorted_clustersizes[cl_size];
+                if (assignedcluster[representative] == UINT_MAX) {
+                    assignedcluster[representative] = representative;
+                    std::queue<int> myqueue;
+                    myqueue.push(representative);
+                    std::queue<int> iterationcutoffs;
+                    iterationcutoffs.push(0);
+                    //delete clusters of members;
+                    while (!myqueue.empty()) {
+                        int currentid = myqueue.front();
+                        int iterationcutoff = iterationcutoffs.front();
+                        assignedcluster[currentid] = representative;
+                        myqueue.pop();
+                        iterationcutoffs.pop();
+                        size_t elementSize = (elementOffsets[currentid + 1] - elementOffsets[currentid]);
+                        for (size_t elementId = 0; elementId < elementSize; elementId++) {
+                            unsigned int elementtodelete = elementLookupTable[currentid][elementId];
+                            if (assignedcluster[elementtodelete] == UINT_MAX && iterationcutoff < maxiterations) {
+                                myqueue.push(elementtodelete);
+                                iterationcutoffs.push((iterationcutoff + 1));
                             }
+                            assignedcluster[elementtodelete] = representative;
                         }
-
                     }
+
                 }
             }
-            //delete unnecessary datastructures
-            delete [] sorted_clustersizes;
-            delete [] clusterid_to_arrayposition;
-            delete [] borders_of_set;
         }
+        //delete unnecessary datastructures
+        delete [] sorted_clustersizes;
+        delete [] clusterid_to_arrayposition;
+        delete [] borders_of_set;
+
 
         delete [] elementLookupTable;
         delete [] elements;
@@ -308,6 +302,7 @@ void ClusteringAlgorithms::greedyIncrementalLowMem( unsigned int *assignedcluste
                 char dbKey[255 + 1];
                 Util::parseKey(data, dbKey);
                 const unsigned int key = (unsigned int) strtoul(dbKey, NULL, 10);
+
                 unsigned int currElement = seqDbr->getId(key);
                 unsigned int targetId;
 
@@ -326,73 +321,16 @@ void ClusteringAlgorithms::greedyIncrementalLowMem( unsigned int *assignedcluste
         }
     }
 
-#pragma omp parallel
-    {
-        int thread_idx = 0;
-#ifdef OPENMP
-        thread_idx = omp_get_thread_num();
-#endif
-#pragma omp for schedule(dynamic, 1000)
-        for (size_t id = 0; id < dbSize; id++) {
-            unsigned int clusterKey = seqDbr->getDbKey(id);
-            unsigned int clusterId = id;
-
-            const size_t alnId = alnDbr->getId(clusterKey);
-            char *data = alnDbr->getData(alnId, thread_idx);
-
-            while (*data != '\0') {
-                char dbKey[255 + 1];
-                Util::parseKey(data, dbKey);
-                const unsigned int key = (unsigned int) strtoul(dbKey, NULL, 10);
-                unsigned int currElement = seqDbr->getId(key);
-                unsigned int targetId;
-
-                __atomic_load(&assignedcluster[currElement], &targetId, __ATOMIC_RELAXED);
-                do {
-                    if (targetId <= clusterId) break;
-                } while (!__atomic_compare_exchange(&assignedcluster[currElement], &targetId, &clusterId, false,
-                                                    __ATOMIC_RELAXED, __ATOMIC_RELAXED));
-
-                if (currElement == UINT_MAX || currElement > seqDbr->getSize()) {
-                    Debug(Debug::ERROR) << "Element " << dbKey
-                                        << " contained in some alignment list, but not contained in the sequence database!\n";
-                    EXIT(EXIT_FAILURE);
-                }
-                data = Util::skipLine(data);
-            }
-        }
-    }
-
     // correct edges that are not assigned properly
     for (size_t id = 0; id < dbSize; ++id) {
         unsigned int assignedClusterId = assignedcluster[id];
+        // check whether the assigned cluster id is itself a representative sequence
+        // if not, make it a representative sequence again
         if (assignedcluster[assignedClusterId] != assignedClusterId){
             assignedcluster[assignedClusterId] = assignedClusterId;
         }
     }
-}
 
-void ClusteringAlgorithms::greedyIncremental(unsigned int **elementLookupTable, size_t *elementOffsets,
-                                             size_t n, unsigned int *assignedcluster) {
-    Debug::Progress progress(n);
-    for(size_t i = 0; i < n; i++) {
-        // seqDbr is descending sorted by length
-        // the assumption is that clustering is B -> B (not A -> B)
-        progress.updateProgress();
-        if(assignedcluster[i] == UINT_MAX){
-            size_t elementSize = (elementOffsets[i + 1] - elementOffsets[i]);
-            for (size_t elementId = 0; elementId < elementSize; elementId++) {
-                const unsigned int currElm = elementLookupTable[i][elementId];
-                if(assignedcluster[currElm] == currElm){
-                    assignedcluster[i] = currElm;
-                    break;
-                }
-            }
-            if(assignedcluster[i] == UINT_MAX) {
-                assignedcluster[i] = i;
-            }
-        }
-    }
 }
 
 void ClusteringAlgorithms::readInClusterData(unsigned int **elementLookupTable, unsigned int *&elements,
@@ -411,7 +349,7 @@ void ClusteringAlgorithms::readInClusterData(unsigned int **elementLookupTable,
             const size_t alnId = alnDbr->getId(clusterId);
             const char *data = alnDbr->getData(alnId, thread_idx);
             const size_t dataSize = alnDbr->getEntryLen(alnId);
-            elementOffsets[i] = Util::countLines(data, dataSize);
+            elementOffsets[i] = (*data == '\0') ? 1 : Util::countLines(data, dataSize);
         }
     }
 
diff --git a/src/clustering/Main.cpp b/src/clustering/Main.cpp
index 23d175a..b7c0ee9 100644
--- a/src/clustering/Main.cpp
+++ b/src/clustering/Main.cpp
@@ -1,24 +1,14 @@
-#include <iostream>
 #include "Clustering.h"
 #include "Parameters.h"
-#include "Debug.h"
-
-#ifdef OPENMP
-#include <omp.h>
-#endif
-
 
 int clust(int argc, const char **argv, const Command& command) {
     Parameters& par = Parameters::getInstance();
     par.parseParameters(argc, argv, command, true, 0, 0);
 
-    Clustering* clu = new Clustering(par.db1, par.db1Index, par.db2, par.db2Index,
-                                     par.db3, par.db3Index, par.maxIteration,
-                                     par.similarityScoreType, par.threads, par.compressed);
-
-    clu->run(par.clusteringMode);
-
-    delete clu;
-    return 0;    
+    Clustering clu(par.db1, par.db1Index, par.db2, par.db2Index,
+                   par.db3, par.db3Index, par.maxIteration,
+                   par.similarityScoreType, par.threads, par.compressed);
+    clu.run(par.clusteringMode);
+    return EXIT_SUCCESS;
 }
 
diff --git a/src/commons/A3MReader.cpp b/src/commons/A3MReader.cpp
index 575ba58..d99fa77 100644
--- a/src/commons/A3MReader.cpp
+++ b/src/commons/A3MReader.cpp
@@ -135,7 +135,7 @@ bool A3mReader::columnHasInsertion(size_t col) {
 }
 
 void A3mReader::addSequence(const std::string& sequence) {
-    if (sequence.size() == 0) {
+    if (sequence.empty()) {
         return;
     }
 
@@ -148,7 +148,7 @@ void A3mReader::addSequence(const std::string& sequence) {
     }
 
     // The first sequence is easy.
-    if (entries.size() == 0) {
+    if (entries.empty()) {
         entries.push_back(copy);
         length = sequence.size();
         return;
diff --git a/src/commons/Application.cpp b/src/commons/Application.cpp
index b03e8e5..b5d3023 100644
--- a/src/commons/Application.cpp
+++ b/src/commons/Application.cpp
@@ -4,8 +4,10 @@
 #include "DistanceCalculator.h"
 #include "Timer.h"
 
-#ifndef NEON
+#if !defined(NEON) && !defined(WASM) && !defined(__ALTIVEC__)
 #include <CpuInfo.h>
+#else
+#define NO_CPUINFO
 #endif
 
 #include <iomanip>
@@ -24,7 +26,7 @@ extern std::vector<Command> baseCommands;
 extern std::vector<Categories> categories;
 
 void checkCpu() {
-#ifndef NEON
+#ifndef NO_CPUINFO
     CpuInfo info;
     if (info.HW_x64 == false) {
         Debug(Debug::ERROR) << "64-bit system is required to run MMseqs2.\n";
@@ -73,15 +75,17 @@ int runCommand(Command *p, int argc, const char **argv) {
 
 void printUsage(bool showExtended) {
     std::stringstream usage;
+
     usage << tool_introduction << "\n\n";
     usage << tool_name << " Version: " << version << "\n";
-    usage << "© " << main_author << "\n";
+    usage << "© " << main_author << "\n\n";
+    usage << "usage: " << binary_name << " <command> [<args>]" << "\n";
 
     std::vector<int> showCategoryHeader(categories.size(), 0);
     for (size_t i = 0; i < categories.size(); ++i) {
         for (size_t j = 0; j < commands.size(); j++) {
             Command &p = commands[j];
-            if (p.mode == categories[i].mode) {
+            if (p.mode & categories[i].mode) {
                 showCategoryHeader[i] = 1;
                 break;
             }
@@ -91,7 +95,7 @@ void printUsage(bool showExtended) {
         }
         for (size_t j = 0; j < baseCommands.size(); j++) {
             Command &p = baseCommands[j];
-            if (p.mode == categories[i].mode) {
+            if (p.mode & categories[i].mode) {
                 showCategoryHeader[i] = 1;
                 break;
             }
@@ -101,10 +105,11 @@ void printUsage(bool showExtended) {
 
     for (size_t i = 0; i < categories.size(); ++i) {
         if (showExtended == false
-            && categories[i].mode != COMMAND_MAIN && categories[i].mode != COMMAND_EASY
-            && categories[i].mode != COMMAND_FORMAT_CONVERSION && categories[i].mode != COMMAND_TAXONOMY) {
-            // TODO not ready for prime time yet
-            // && categories[i].mode != COMMAND_MULTIHIT) {
+            && (categories[i].mode & COMMAND_MAIN) == 0
+            && (categories[i].mode & COMMAND_EASY) == 0
+            && (categories[i].mode & COMMAND_DATABASE_CREATION) == 0
+            && (categories[i].mode & COMMAND_FORMAT_CONVERSION) == 0
+            ) {
             continue;
         }
 
@@ -115,8 +120,11 @@ void printUsage(bool showExtended) {
         usage << "\n" << std::setw(20) << categories[i].title << "\n";
         for (size_t j = 0; j < commands.size(); j++) {
             struct Command &p = commands[j];
-            if (p.mode == categories[i].mode) {
-                usage << std::left << std::setw(20) << "  " + std::string(p.cmd) << "\t" << p.shortDescription << "\n";
+            if (showExtended == false && (p.mode & COMMAND_EXPERT) != 0) {
+                continue;
+            }
+            if (p.mode & categories[i].mode) {
+                usage << std::left << std::setw(20) << "  " + std::string(p.cmd) << "\t" << p.description << "\n";
             }
         }
         if (hide_base_commands) {
@@ -124,19 +132,22 @@ void printUsage(bool showExtended) {
         }
         for (size_t j = 0; j < baseCommands.size(); j++) {
             struct Command &p = baseCommands[j];
-            if (p.mode == categories[i].mode) {
-                usage << std::left << std::setw(20) << "  " + std::string(p.cmd) << "\t" << p.shortDescription << "\n";
+            if (showExtended == false && (p.mode & COMMAND_EXPERT) != 0) {
+                continue;
+            }
+            if (p.mode & categories[i].mode) {
+                usage << std::left << std::setw(20) << "  " + std::string(p.cmd) << "\t" << p.description << "\n";
             }
         }
     }
 
     if (show_extended_help != NULL) {
         if (showExtended == false) {
-            usage << "\n\nAn extended list of all tools can be obtained by calling '" << binary_name << " -h'.\n";
+            usage << "\nAn extended list of all modules can be obtained by calling '" << binary_name << " -h'.\n";
         }
     }
     if (show_bash_info != NULL) {
-        usage  << "\nBash completion for tools and parameters can be installed by adding \"source MMSEQS_HOME/util/bash-completion.sh\" to your \"$HOME/.bash_profile\".\nInclude the location of the " << tool_name << " binary in your \"$PATH\" environment variable.";
+        usage  << "\nBash completion for modules and parameters can be installed by adding \"source MMSEQS_HOME/util/bash-completion.sh\" to your \"$HOME/.bash_profile\".\nInclude the location of the " << tool_name << " binary in your \"$PATH\" environment variable.";
     }
     Debug(Debug::INFO) << usage.str() << "\n";
 }
@@ -146,14 +157,14 @@ int shellcompletion(int argc, const char **argv) {
     if (argc == 0) {
         for (size_t i = 0; i < commands.size(); i++) {
             struct Command &p = commands[i];
-            if (p.mode == COMMAND_HIDDEN)
+            if (p.mode & COMMAND_HIDDEN)
                 continue;
             Debug(Debug::INFO) << p.cmd << " ";
         }
         if (hide_base_commands == false) {
             for (size_t i = 0; i < baseCommands.size(); i++) {
                 struct Command &p = baseCommands[i];
-                if (p.mode == COMMAND_HIDDEN)
+                if (p.mode & COMMAND_HIDDEN)
                     continue;
                 Debug(Debug::INFO) << p.cmd << " ";
             }
@@ -227,7 +238,7 @@ int main(int argc, const char **argv) {
         int maxDistance = 0;
         for (size_t i = 0; i < commands.size(); ++i) {
             struct Command &p = commands[i];
-            if (p.mode == COMMAND_HIDDEN) {
+            if (p.mode & COMMAND_HIDDEN) {
                 continue;
             }
 
@@ -241,7 +252,7 @@ int main(int argc, const char **argv) {
         if (hide_base_commands == false) {
             for (size_t i = 0; i < baseCommands.size(); ++i) {
                 struct Command &p = baseCommands[i];
-                if (p.mode == COMMAND_HIDDEN)
+                if (p.mode & COMMAND_HIDDEN)
                     continue;
 
                 int distance = DistanceCalculator::localLevenshteinDistance(argv[1], p.cmd);
diff --git a/src/commons/BaseMatrix.cpp b/src/commons/BaseMatrix.cpp
index 7bff2cf..c6d299e 100644
--- a/src/commons/BaseMatrix.cpp
+++ b/src/commons/BaseMatrix.cpp
@@ -12,18 +12,18 @@ const double BaseMatrix::ANY_BACK = 1E-5;
 BaseMatrix::BaseMatrix(){
     // init [amino acid <-> int] mappings
 
-    int2aa = new char[255];
-    aa2int = new int[UCHAR_MAX];
+    num2aa = new char[255];
+    aa2num = new unsigned char[UCHAR_MAX];
     for (int i = 0; i < UCHAR_MAX; ++i) {
-        aa2int[i]=-1;
+        aa2num[i]=-1;
     }
 
 
 }
 
 BaseMatrix::~BaseMatrix(){
-    delete[] int2aa;
-    delete[] aa2int;
+    delete[] num2aa;
+    delete[] aa2num;
     delete[] pBack;
     for (int i = 0; i < allocatedAlphabetSize; i++){
         delete[] probMatrix[i];
@@ -59,15 +59,15 @@ void BaseMatrix::initMatrixMemory(int alphabetSize) {
 }
 
 
-void BaseMatrix::print(short** matrix, char* int2aa, int size){
+void BaseMatrix::print(short** matrix, char* num2aa, int size){
     std::cout << "\n";
     short avg = 0;
     printf("     ");
     for (int i = 0; i < size; i++)
-        printf("%4c ", int2aa[i]);
+        printf("%4c ", num2aa[i]);
     std::cout << "\n";
     for (int i = 0; i < size; i++){
-        printf("%4c ", int2aa[i]);
+        printf("%4c ", num2aa[i]);
         for (int j = 0; j < size; j++){
             printf("%4d ", matrix[i][j]);
             avg += matrix[i][j];
@@ -77,15 +77,15 @@ void BaseMatrix::print(short** matrix, char* int2aa, int size){
     std::cout << ((double)avg/(double)(size*size)) << "\n";
 }
 
-void BaseMatrix::print(double** matrix, char* int2aa, int size){
+void BaseMatrix::print(double** matrix, char* num2aa, int size){
     std::cout << "\n";
     double avg = 0.0;
     printf("%7c ", ' ');
     for (int i = 0; i < size; i++)
-        printf("%7c ", int2aa[i]);
+        printf("%7c ", num2aa[i]);
     std::cout << "\n";
     for (int i = 0; i < size; i++){
-        printf("%7c ", int2aa[i]);
+        printf("%7c ", num2aa[i]);
         for (int j = 0; j < size; j++){
             printf("%7.4f ", matrix[i][j]);
             avg += matrix[i][j];
diff --git a/src/commons/BaseMatrix.h b/src/commons/BaseMatrix.h
index 237d1a4..0ec8aae 100644
--- a/src/commons/BaseMatrix.h
+++ b/src/commons/BaseMatrix.h
@@ -10,10 +10,10 @@ class BaseMatrix{
     virtual ~BaseMatrix();
 
     /*contains amino acid to int mapping*/
-    int*  aa2int;
+    unsigned char* aa2num;
 
     /*contains int to amino acid mapping*/
-    char* int2aa;
+    char* num2aa;
 
     /* size of alphabet*/
     int alphabetSize;
@@ -42,9 +42,9 @@ class BaseMatrix{
     static const double ANY_BACK;
 
     // print the substitution matrix
-    static void print(short** matrix, char* int2aa, int size);
+    static void print(short** matrix, char* num2aa, int size);
 
-    static void print(double** matrix, char* int2aa, int size);
+    static void print(double** matrix, char* num2aa, int size);
 
     void initMatrixMemory(int alphSize);
 
@@ -81,7 +81,7 @@ class ProbabilityMatrix {
     ProbabilityMatrix(BaseMatrix &matrix) : alphabetSize(matrix.alphabetSize) {
         probMatrix = new double*[matrix.alphabetSize];
         probMatrixPointers = new const double*[matrix.alphabetSize];
-        std::fill_n(hardMaskTable, 256, matrix.aa2int[(int) 'X']);
+        std::fill_n(hardMaskTable, 256, matrix.aa2num[static_cast<int>('X')]);
         for (int i = 0; i < matrix.alphabetSize; ++i) {
             probMatrix[i] = new double[matrix.alphabetSize];
             probMatrixPointers[i] = probMatrix[i];
diff --git a/src/commons/CMakeLists.txt b/src/commons/CMakeLists.txt
index f013dc2..022a79a 100644
--- a/src/commons/CMakeLists.txt
+++ b/src/commons/CMakeLists.txt
@@ -22,6 +22,7 @@ set(commons_header_files
         commons/MathUtil.h
         commons/MemoryMapped.h
         commons/MMseqsMPI.h
+        commons/MultiParam.h
         commons/NucleotideMatrix.h
         commons/Orf.h
         commons/ProfileStates.h
@@ -29,7 +30,6 @@ set(commons_header_files
         commons/Parameters.h
         commons/PatternCompiler.h
         commons/ScoreMatrix.h
-        commons/ScoreMatrixFile.h
         commons/Sequence.h
         commons/SubstitutionMatrix.h
         commons/SubstitutionMatrixProfileStates.h
@@ -57,12 +57,12 @@ set(commons_source_files
         commons/KSeqWrapper.cpp
         commons/MemoryMapped.cpp
         commons/MMseqsMPI.cpp
+        commons/MultiParam.cpp
         commons/NucleotideMatrix.cpp
         commons/Orf.cpp
         commons/Parameters.cpp
         commons/ProfileStates.cpp
         commons/LibraryReader.cpp
-        commons/ScoreMatrixFile.cpp
         commons/Sequence.cpp
         commons/SubstitutionMatrix.cpp
         commons/tantan.cpp
diff --git a/src/commons/Command.cpp b/src/commons/Command.cpp
index f6b9121..636f677 100644
--- a/src/commons/Command.cpp
+++ b/src/commons/Command.cpp
@@ -2,15 +2,23 @@
 #include "Parameters.h"
 
 std::vector<Categories> categories = {
-        {"Easy workflows (for non-experts)",     COMMAND_EASY},
-        {"Main tools  (for non-experts)",        COMMAND_MAIN},
-        {"Utility tools for format conversions", COMMAND_FORMAT_CONVERSION},
-        {"Taxonomy tools",                       COMMAND_TAXONOMY},
-        {"Multi-hit search tools",               COMMAND_MULTIHIT},
-        {"Utility tools for clustering",         COMMAND_CLUSTER},
-        {"Core tools (for advanced users)",      COMMAND_EXPERT},
-        {"Utility tools to manipulate DBs",      COMMAND_DB},
-        {"Special-purpose utilities",            COMMAND_SPECIAL},
+        {"Easy workflows for plain text input/output",   COMMAND_EASY},
+        {"Main workflows for database input/output",     COMMAND_MAIN},
+        {"Input database creation",                      COMMAND_DATABASE_CREATION},
+        {"Handle databases on storage and memory",       COMMAND_STORAGE},
+        {"Unite and intersect databases",                COMMAND_SET},
+        {"Format conversion for downstream processing",  COMMAND_FORMAT_CONVERSION},
+        {"Sequence manipulation/transformation",         COMMAND_SEQUENCE},
+        {"Result manipulation",                          COMMAND_RESULT},
+        {"Taxonomy assignment",                          COMMAND_TAXONOMY},
+        {"Multi-hit search",                             COMMAND_MULTIHIT},
+        {"Prefiltering",                                 COMMAND_PREFILTER},
+        {"Alignment",                                    COMMAND_ALIGNMENT},
+        {"Clustering",                                   COMMAND_CLUSTER},
+        {"Profile databases",                            COMMAND_PROFILE},
+        {"Profile-profile databases",                    COMMAND_PROFILE_PROFILE},
+        {"Utility modules to manipulate DBs",            COMMAND_DB},
+        {"Special-purpose utilities",                    COMMAND_SPECIAL},
 };
 
 
@@ -21,11 +29,11 @@ std::vector<int> DbValidator::aaDb = {Parameters::DBTYPE_AMINO_ACIDS};
 std::vector<int> DbValidator::prefAlnResDb =  {Parameters::DBTYPE_ALIGNMENT_RES, Parameters::DBTYPE_PREFILTER_RES};
 std::vector<int> DbValidator::taxSequenceDb = {Parameters::DBTYPE_SEQTAXDB, Parameters::DBTYPE_INDEX_DB, Parameters::DBTYPE_NUCLEOTIDES,
                                                Parameters::DBTYPE_HMM_PROFILE, Parameters::DBTYPE_AMINO_ACIDS};
-std::vector<int> DbValidator::allDb = {Parameters::DBTYPE_SEQTAXDB, Parameters::DBTYPE_INDEX_DB, Parameters::DBTYPE_NUCLEOTIDES,
+std::vector<int> DbValidator::allDb = {Parameters::DBTYPE_SEQTAXDB, Parameters::DBTYPE_INDEX_DB, Parameters::DBTYPE_NUCLEOTIDES, Parameters::DBTYPE_MSA_DB,
                                       Parameters::DBTYPE_HMM_PROFILE, Parameters::DBTYPE_AMINO_ACIDS, Parameters::DBTYPE_ALIGNMENT_RES,
                                       Parameters::DBTYPE_PREFILTER_RES, Parameters::DBTYPE_PREFILTER_REV_RES, Parameters::DBTYPE_CLUSTER_RES,
                                       Parameters::DBTYPE_OFFSETDB, Parameters::DBTYPE_GENERIC_DB, Parameters::DBTYPE_TAXONOMICAL_RESULT};
-std::vector<int> DbValidator::allDbAndFlat = {Parameters::DBTYPE_SEQTAXDB, Parameters::DBTYPE_INDEX_DB, Parameters::DBTYPE_NUCLEOTIDES,
+std::vector<int> DbValidator::allDbAndFlat = {Parameters::DBTYPE_SEQTAXDB, Parameters::DBTYPE_INDEX_DB, Parameters::DBTYPE_NUCLEOTIDES, Parameters::DBTYPE_MSA_DB,
                                               Parameters::DBTYPE_HMM_PROFILE, Parameters::DBTYPE_AMINO_ACIDS, Parameters::DBTYPE_ALIGNMENT_RES,
                                               Parameters::DBTYPE_PREFILTER_RES, Parameters::DBTYPE_CLUSTER_RES, Parameters::DBTYPE_GENERIC_DB,
                                               Parameters::DBTYPE_FLATFILE};
@@ -42,4 +50,6 @@ std::vector<int> DbValidator::nuclAaDb = {Parameters::DBTYPE_NUCLEOTIDES, Parame
 std::vector<int> DbValidator::alignmentDb = {Parameters::DBTYPE_ALIGNMENT_RES};
 std::vector<int> DbValidator::directory = {Parameters::DBTYPE_DIRECTORY};
 std::vector<int> DbValidator::flatfile = {Parameters::DBTYPE_FLATFILE};
+std::vector<int> DbValidator::flatfileAndStdin = {Parameters::DBTYPE_FLATFILE, Parameters::DBTYPE_STDIN};
 std::vector<int> DbValidator::resultDb =  {Parameters::DBTYPE_ALIGNMENT_RES, Parameters::DBTYPE_PREFILTER_RES, Parameters::DBTYPE_PREFILTER_REV_RES, Parameters::DBTYPE_CLUSTER_RES};
+std::vector<int> DbValidator::empty = {};
diff --git a/src/commons/Command.h b/src/commons/Command.h
index b02a565..05cf07e 100644
--- a/src/commons/Command.h
+++ b/src/commons/Command.h
@@ -11,23 +11,33 @@ const unsigned int CITATION_PLASS    = 1 << 4;
 const unsigned int CITATION_SERVER   = 1 << 5;
 
 // Make sure this is always the last bit
-// citations from inheriting tools will start from here
+// citations from inheriting modules will start from here
 const unsigned int CITATION_END      = CITATION_SERVER << 1;
 
 struct MMseqsParameter;
 
-enum CommandMode {
-    COMMAND_MAIN = 0,
-    COMMAND_FORMAT_CONVERSION,
-    COMMAND_CLUSTER,
-    COMMAND_TAXONOMY,
-    COMMAND_MULTIHIT,
-    COMMAND_DB,
-    COMMAND_EXPERT,
-    COMMAND_SPECIAL,
-    COMMAND_HIDDEN,
-    COMMAND_EASY
-};
+typedef const unsigned int CommandMode;
+// Every command category occupies its own bit; bit 0 is not assigned here.
+CommandMode COMMAND_MAIN              = 1 << 1;
+CommandMode COMMAND_FORMAT_CONVERSION = 1 << 2;
+CommandMode COMMAND_TAXONOMY          = 1 << 3;
+CommandMode COMMAND_MULTIHIT          = 1 << 4;
+CommandMode COMMAND_DB                = 1 << 5;
+CommandMode COMMAND_SPECIAL           = 1 << 6;
+CommandMode COMMAND_HIDDEN            = 1 << 7;
+CommandMode COMMAND_EASY              = 1 << 8;
+CommandMode COMMAND_DATABASE_CREATION = 1 << 9;
+CommandMode COMMAND_STORAGE           = 1 << 10;
+CommandMode COMMAND_SET               = 1 << 11;
+CommandMode COMMAND_SEQUENCE          = 1 << 12;
+CommandMode COMMAND_RESULT            = 1 << 13;
+CommandMode COMMAND_PREFILTER         = 1 << 14;
+CommandMode COMMAND_ALIGNMENT         = 1 << 15;
+CommandMode COMMAND_CLUSTER           = 1 << 16;
+CommandMode COMMAND_PROFILE           = 1 << 17;
+CommandMode COMMAND_PROFILE_PROFILE   = 1 << 18;
+// Unsigned literal: 1 << 31 would shift into the sign bit of a (32-bit) int.
+CommandMode COMMAND_EXPERT            = 1u << 31;
 
 
 
@@ -53,6 +63,8 @@ struct DbValidator {
     static std::vector<int> taxResult;
     static std::vector<int> directory;
     static std::vector<int> flatfile;
+    static std::vector<int> flatfileAndStdin;
+    static std::vector<int> empty;
 };
 
 
@@ -64,6 +76,7 @@ struct DbType{
     static const int NEED_LOOKUP = 2;
     static const int NEED_TAXONOMY = 4;
     static const int VARIADIC = 8;
+    static const int ZERO_OR_ALL = 16;
 
     const char *usageText;
     int accessMode;
@@ -77,8 +90,8 @@ struct Command {
     int (*commandFunction)(int, const char **, const Command&);
     std::vector<MMseqsParameter*>* params;
     CommandMode mode;
-    const char *shortDescription;
-    const char *longDescription;
+    const char *description;
+    const char *examples;
     const char *author;
     const char *usage;
     unsigned int citations;
diff --git a/src/commons/DBConcat.cpp b/src/commons/DBConcat.cpp
index 9aa7af6..f7acb44 100644
--- a/src/commons/DBConcat.cpp
+++ b/src/commons/DBConcat.cpp
@@ -1,5 +1,6 @@
 #include "DBConcat.h"
 #include "DBWriter.h"
+#include "itoa.h"
 #include "Util.h"
 #include "Debug.h"
 #include "FileUtil.h"
@@ -14,42 +15,42 @@
 DBConcat::DBConcat(const std::string &dataFileNameA, const std::string &indexFileNameA,
                    const std::string &dataFileNameB, const std::string &indexFileNameB,
                    const std::string &dataFileNameC, const std::string &indexFileNameC,
-                   unsigned int threads, int dataMode, bool preserveKeysA, bool preserveKeysB, bool takeLargerEntry)
-        : DBReader((dataFileNameA == dataFileNameB ? dataFileNameA : dataFileNameC).c_str(),
-                   (indexFileNameA == indexFileNameB ? indexFileNameA : indexFileNameC).c_str(), threads, dataMode),
-          dataFileNameA(dataFileNameA), indexFileNameA(indexFileNameA),
-          dataFileNameB(dataFileNameB), indexFileNameB(indexFileNameB),
-          dataFileNameC(dataFileNameC), indexFileNameC(indexFileNameC),
-          threads(threads), preserveKeysA(preserveKeysA),preserveKeysB(preserveKeysB), takeLargerEntry(takeLargerEntry) {
+                   unsigned int threads, int dataMode, bool write, bool preserveKeysA, bool preserveKeysB, bool takeLargerEntry)
+        : DBReader((dataFileNameA == dataFileNameB ? dataFileNameA : dataFileNameC).c_str(), (indexFileNameA == indexFileNameB ? indexFileNameA : indexFileNameC).c_str(), threads, dataMode) {
     sameDatabase = dataFileNameA == dataFileNameB;
-}
-
-// If dbA != dbB, then Concatenate dbA and dbB in concatWriter ("dataFileNameC")
-// and "this" will be a reader on "dataFileNameC" after calling open()
-// otherwise, do nothing and "this"  will be a reader on "dataFileNameA"
-void DBConcat::concat(bool write) {
     if (sameDatabase) {
         return;
     }
 
-    DBReader<unsigned int> dbA(dataFileNameA.c_str(), indexFileNameA.c_str(), threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
-    DBReader<unsigned int> dbB(dataFileNameB.c_str(), indexFileNameB.c_str(), threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
-
-
+    DBReader<unsigned int> dbA(dataFileNameA.c_str(), indexFileNameA.c_str(), threads, dataMode);
+    DBReader<unsigned int> dbB(dataFileNameB.c_str(), indexFileNameB.c_str(), threads, dataMode);
     dbA.open(DBReader<unsigned int>::LINEAR_ACCCESS);
     dbB.open(DBReader<unsigned int>::LINEAR_ACCCESS);
 
     indexSizeA = dbA.getSize();
     indexSizeB = dbB.getSize();
 
-    // keys paris are like : (key,i) where key is the ith key in the ffindex
+    // key pairs are like: (key, i) where key is the i-th key in the database
     keysA = new std::pair<unsigned int, unsigned int>[indexSizeA];
     keysB = new std::pair<unsigned int, unsigned int>[indexSizeB];
 
     DBWriter* concatWriter = NULL;
+    bool shouldConcatMapping = false;
+    bool shouldConcatLookup = false;
+    bool shouldConcatSource = false;
     if (write) {
         concatWriter = new DBWriter(dataFileNameC.c_str(), indexFileNameC.c_str(), threads, Parameters::WRITER_ASCII_MODE, dbA.getDbtype());
         concatWriter->open();
+
+        if (FileUtil::fileExists((dataFileNameA + "_mapping").c_str()) && FileUtil::fileExists((dataFileNameB + "_mapping").c_str())) {
+            shouldConcatMapping = true;
+        }
+        if (FileUtil::fileExists((dataFileNameA + ".lookup").c_str()) && FileUtil::fileExists((dataFileNameB + ".lookup").c_str())) {
+            shouldConcatLookup = true;
+        }
+        if (FileUtil::fileExists((dataFileNameA + ".source").c_str()) && FileUtil::fileExists((dataFileNameB + ".source").c_str())) {
+            shouldConcatSource = true;
+        }
     }
 
     Debug::Progress progress(indexSizeA);
@@ -139,6 +140,195 @@ void DBConcat::concat(bool write) {
     }
     dbA.close();
     dbB.close();
+
+
+    // handle mapping
+    if (shouldConcatMapping) {
+        char buffer[1024];
+        std::vector<std::pair<unsigned int, unsigned int>> mappingA;
+        Util::readMapping((dataFileNameA + "_mapping"), mappingA);
+        std::vector<std::pair<unsigned int, unsigned int>> mappingB;
+        Util::readMapping((dataFileNameB + "_mapping"), mappingB);
+
+        FILE* mappingFilePtr = fopen((dataFileNameC + "_mapping").c_str(), "w");
+
+        for(size_t i = 0; i < mappingA.size(); ++i) {
+            unsigned int prevKeyA = mappingA[i].first;
+            unsigned int taxidA = mappingA[i].second;
+            unsigned int newKeyA = dbAKeyMap(prevKeyA);
+
+            char * basePos = buffer;
+            char * tmpBuff = Itoa::u32toa_sse2(static_cast<uint32_t>(newKeyA), buffer);
+            *(tmpBuff-1) = '\t';
+            tmpBuff = Itoa::u32toa_sse2(static_cast<uint64_t>(taxidA), tmpBuff);;
+            *(tmpBuff-1) = '\n';
+            size_t length = tmpBuff - basePos;
+
+            int written = fwrite(buffer, sizeof(char), length, mappingFilePtr);
+            if (written != (int) length) {
+                Debug(Debug::ERROR) << "Cannot write to data file " << dataFileNameC << "_mapping\n";
+                EXIT(EXIT_FAILURE);
+            }
+        }
+
+        for(size_t i = 0; i < mappingB.size(); ++i) {
+            unsigned int prevKeyB = mappingB[i].first;
+            unsigned int taxidB = mappingB[i].second;
+            unsigned int newKeyB = dbBKeyMap(prevKeyB);
+
+            char * basePos = buffer;
+            char * tmpBuff = Itoa::u32toa_sse2(static_cast<uint32_t>(newKeyB), buffer);
+            *(tmpBuff-1) = '\t';
+            tmpBuff = Itoa::u32toa_sse2(static_cast<uint64_t>(taxidB), tmpBuff);;
+            *(tmpBuff-1) = '\n';
+            size_t length = tmpBuff - basePos;
+
+            int written = fwrite(buffer, sizeof(char), length, mappingFilePtr);
+            if (written != (int) length) {
+                Debug(Debug::ERROR) << "Cannot write to data file " << dataFileNameC << "_mapping\n";
+                EXIT(EXIT_FAILURE);
+            }
+        }
+        fclose (mappingFilePtr);
+    }
+
+    unsigned int maxSetIdA = 0;
+    // handle lookup
+    if (shouldConcatLookup) {
+        DBReader<unsigned int> lookupReaderA(dataFileNameA.c_str(), indexFileNameA.c_str(), 1, DBReader<unsigned int>::USE_LOOKUP);
+        lookupReaderA.open(DBReader<unsigned int>::NOSORT);
+        DBReader<unsigned int>::LookupEntry* lookupA = lookupReaderA.getLookup();
+
+        FILE* lookupFilePtr = fopen((dataFileNameC + ".lookup").c_str(), "w");
+
+        char buffer[1024];
+        std::string line;
+
+        for (size_t i = 0; i < lookupReaderA.getLookupSize(); ++i) {
+            unsigned int prevKeyA = lookupA[i].id;
+            std::string accA = lookupA[i].entryName;
+            unsigned int setIdA = lookupA[i].fileNumber;
+            if (setIdA > maxSetIdA) {
+                maxSetIdA = setIdA;
+            }
+
+            unsigned int newKeyA = dbAKeyMap(prevKeyA);
+
+            char *tmpBuff = Itoa::u32toa_sse2(static_cast<uint32_t>(newKeyA), buffer);
+            line.append(buffer, tmpBuff - buffer - 1);
+            line.append(1, '\t');
+            line.append(accA);
+            line.append(1, '\t');
+            tmpBuff = Itoa::u32toa_sse2(static_cast<uint64_t>(setIdA), buffer);
+            line.append(buffer, tmpBuff - buffer - 1);
+            line.append(1, '\n');
+            
+            int written = fwrite(line.c_str(), sizeof(char), line.size(), lookupFilePtr);
+            if (written != (int) line.size()) {
+                Debug(Debug::ERROR) << "Cannot write to data file " << dataFileNameC << ".lookup\n";
+                EXIT(EXIT_FAILURE);
+            }
+
+            line.clear();
+        }
+        lookupReaderA.close();
+
+        // for B we compute: newSetIdB = maxSetIdA + 1 + setIdB
+        DBReader<unsigned int> lookupReaderB(dataFileNameB.c_str(), indexFileNameB.c_str(), 1, DBReader<unsigned int>::USE_LOOKUP);
+        lookupReaderB.open(DBReader<unsigned int>::NOSORT);
+        DBReader<unsigned int>::LookupEntry* lookupB = lookupReaderB.getLookup();
+
+        for (size_t i = 0; i < lookupReaderB.getLookupSize(); ++i) {
+            unsigned int prevKeyB = lookupB[i].id;
+            std::string accB = lookupB[i].entryName;
+            unsigned int setIdB = lookupB[i].fileNumber;
+            
+            unsigned int newKeyB = dbBKeyMap(prevKeyB);
+            unsigned int newSetIdB = maxSetIdA + 1 + setIdB;
+
+            char *tmpBuff = Itoa::u32toa_sse2(static_cast<uint32_t>(newKeyB), buffer);
+            line.append(buffer, tmpBuff - buffer - 1);
+            line.append(1, '\t');
+            line.append(accB);
+            line.append(1, '\t');
+            tmpBuff = Itoa::u32toa_sse2(static_cast<uint64_t>(newSetIdB), buffer);
+            line.append(buffer, tmpBuff - buffer - 1);
+            line.append(1, '\n');
+            
+            int written = fwrite(line.c_str(), sizeof(char), line.size(), lookupFilePtr);
+            if (written != (int) line.size()) {
+                Debug(Debug::ERROR) << "Cannot write to data file " << dataFileNameC << ".lookup\n";
+                EXIT(EXIT_FAILURE);
+            }
+
+            line.clear();
+        }
+        lookupReaderB.close();
+        fclose (lookupFilePtr);
+    }
+
+    // handle source
+    if (shouldConcatSource) {
+        unsigned int sourceMaxSetIdA = 0;
+        std::map<unsigned int, std::string> sourceMapA = Util::readLookup((dataFileNameA + ".source"), false);
+        std::map<unsigned int, std::string>::iterator itA;
+        
+        char buffer[1024];
+        std::string line;
+
+        FILE* sourceFilePtr = fopen((dataFileNameC + ".source").c_str(), "w");
+
+        for (itA = sourceMapA.begin(); itA != sourceMapA.end(); itA++) {
+            unsigned int setIdA = itA->first;
+            std::string fileNameA = itA->second;
+            if (setIdA > sourceMaxSetIdA) {
+                sourceMaxSetIdA = setIdA;
+            }
+
+            char *tmpBuff = Itoa::u32toa_sse2(static_cast<uint64_t>(setIdA), buffer);
+            line.append(buffer, tmpBuff - buffer - 1);
+            line.append(1, '\t');
+            line.append(fileNameA);
+            line.append(1, '\n');
+
+            int written = fwrite(line.c_str(), sizeof(char), line.size(), sourceFilePtr);
+            if (written != (int) line.size()) {
+                Debug(Debug::ERROR) << "Cannot write to data file " << dataFileNameC << ".source\n";
+                EXIT(EXIT_FAILURE);
+            }
+            line.clear();
+        }
+        
+        // if lookup was concatenated - make sure maxSetId there is consistent with sourceMaxSetIdA
+        if (shouldConcatLookup && (sourceMaxSetIdA != maxSetIdA)) {
+            Debug(Debug::ERROR) << "The maxSetId in " << dataFileNameA << ".lookup is " << maxSetIdA << " and in " << dataFileNameA << ".source is " << sourceMaxSetIdA << "\n";
+            EXIT(EXIT_FAILURE);
+        }
+
+        std::map<unsigned int, std::string> sourceMapB = Util::readLookup((dataFileNameB + ".source"), false);
+        std::map<unsigned int, std::string>::iterator itB;
+
+        for (itB = sourceMapB.begin(); itB != sourceMapB.end(); itB++) {
+            unsigned int setIdB = itB->first;
+            std::string fileNameB = itB->second;
+
+            unsigned int newSetIdB = sourceMaxSetIdA + 1 + setIdB;
+
+            char *tmpBuff = Itoa::u32toa_sse2(static_cast<uint64_t>(newSetIdB), buffer);
+            line.append(buffer, tmpBuff - buffer - 1);
+            line.append(1, '\t');
+            line.append(fileNameB);
+            line.append(1, '\n');
+
+            int written = fwrite(line.c_str(), sizeof(char), line.size(), sourceFilePtr);
+            if (written != (int) line.size()) {
+                Debug(Debug::ERROR) << "Cannot write to data file " << dataFileNameC << ".source\n";
+                EXIT(EXIT_FAILURE);
+            }
+            line.clear();
+        }
+        fclose (sourceFilePtr);
+    }
 }
 
 unsigned int DBConcat::dbAKeyMap(unsigned int key) {
@@ -175,16 +365,14 @@ int concatdbs(int argc, const char **argv, const Command& command) {
     setDbConcatDefault(&par);
     par.parseParameters(argc, argv, command, true, 0, 0);
 
+    // TODO check equal db type
+
     int datamode = DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX;
     DBConcat outDB(par.db1.c_str(), par.db1Index.c_str(),
                    par.db2.c_str(), par.db2Index.c_str(),
                    par.db3.c_str(), par.db3Index.c_str(),
-                   static_cast<unsigned int>(par.threads), datamode, true, par.preserveKeysB, par.takeLargerEntry);
-    outDB.concat(true);
+                   static_cast<unsigned int>(par.threads), datamode, true, true, par.preserveKeysB, par.takeLargerEntry);
 
-    if (FileUtil::fileExists((par.db2 + ".dbtype").c_str())) {
-        FileUtil::copyFile((par.db2 + ".dbtype").c_str(), (par.db3 + ".dbtype").c_str());
-    }
 
     return EXIT_SUCCESS;
 }
diff --git a/src/commons/DBConcat.h b/src/commons/DBConcat.h
index 6971542..0edc78c 100644
--- a/src/commons/DBConcat.h
+++ b/src/commons/DBConcat.h
@@ -9,37 +9,22 @@ class DBConcat : public DBReader<unsigned int> {
     DBConcat(const std::string &dataFileNameA, const std::string &indexFileNameA,
              const std::string &dataFileNameB, const std::string &indexFileNameB,
              const std::string &dataFileNameC, const std::string &indexFileNameC,
-             unsigned int threads, int dataMode = USE_DATA | USE_INDEX,
+             unsigned int threads, int dataMode = USE_DATA | USE_INDEX | USE_LOOKUP, bool write = true,
              bool preserveKeysA = false, bool preserveKeysB = false, bool takeLargerEntry = false);
 
     ~DBConcat();
 
-    void concat(bool write = true);
-
     unsigned int dbAKeyMap(unsigned int);
     unsigned int dbBKeyMap(unsigned int);
 
 private:
-    std::string dataFileNameA;
-    std::string indexFileNameA;
-    std::string dataFileNameB;
-    std::string indexFileNameB;
-    std::string dataFileNameC;
-    std::string indexFileNameC;
-
     size_t indexSizeA;
     size_t indexSizeB;
 
     std::pair<unsigned int, unsigned int> *keysA, *keysB;
 
-    unsigned int threads;
-
     bool sameDatabase;
 
-    bool preserveKeysA; // do not change the keys of DBA
-    bool preserveKeysB; // do not change the keys of DBA
-    bool takeLargerEntry; // do not write empty entries
-
     struct compareFirstEntry {
         bool operator()(const std::pair<unsigned int, unsigned int> &lhs,
                         const std::pair<unsigned int, unsigned int> &rhs) const {
diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp
index 01fbf54..e7ac92f 100644
--- a/src/commons/DBReader.cpp
+++ b/src/commons/DBReader.cpp
@@ -1,7 +1,5 @@
 #include "DBReader.h"
 
-#include <iostream>
-#include <fstream>
 #include <algorithm>
 #include <climits>
 #include <cstring>
@@ -19,6 +17,10 @@
 #include "FileUtil.h"
 #include "itoa.h"
 
+#ifdef OPENMP
+#include <omp.h>
+#endif
+
 template <typename T>
 DBReader<T>::DBReader(const char* dataFileName_, const char* indexFileName_, int threads, int dataMode) :
 threads(threads), dataMode(dataMode), dataFileName(strdup(dataFileName_)),
@@ -105,7 +107,7 @@ template <typename T> bool DBReader<T>::open(int accessType){
 
     if (dataMode & USE_DATA) {
         dataFileNames = FileUtil::findDatafiles(dataFileName);
-        if(dataFileNames.size() == 0){
+        if (dataFileNames.empty()) {
             Debug(Debug::ERROR) << "No datafile could be found for " << dataFileName << "!\n";
             EXIT(EXIT_FAILURE);
         }
@@ -433,6 +435,10 @@ template <typename T> void DBReader<T>::remapData(){
         unmapData();
         for(size_t fileIdx = 0; fileIdx < dataFileNames.size(); fileIdx++){
             FILE* dataFile = fopen(dataFileNames[fileIdx].c_str(), "r");
+            if (dataFile == NULL) {
+                Debug(Debug::ERROR) << "Can not open data file " << dataFileNames[fileIdx] << "!\n";
+                EXIT(EXIT_FAILURE);
+            }
             size_t dataSize = 0;
             dataFiles[fileIdx] = mmapData(dataFile, &dataSize);
             fclose(dataFile);
@@ -1023,6 +1029,10 @@ void DBReader<T>::removeDb(const std::string &databaseName){
     if (FileUtil::fileExists(dbTypeFile.c_str())) {
         FileUtil::remove(dbTypeFile.c_str());
     }
+    std::string sourceFile = databaseName + ".source";
+    if (FileUtil::fileExists(sourceFile.c_str())) {
+        FileUtil::remove(sourceFile.c_str());
+    }
     std::string lookupFile = databaseName + ".lookup";
     if (FileUtil::fileExists(lookupFile.c_str())) {
         FileUtil::remove(lookupFile.c_str());
diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h
index a10b173..46c9f21 100644
--- a/src/commons/DBReader.h
+++ b/src/commons/DBReader.h
@@ -45,6 +45,7 @@ struct DBFiles {
         TAXONOMY          = TAX_MAPPING | TAX_NAMES | TAX_NODES | TAX_MERGED,
         SEQUENCE_DB       = GENERIC | HEADERS | TAXONOMY | LOOKUP | SOURCE,
         SEQUENCE_ANCILLARY= SEQUENCE_DB & (~GENERIC),
+        SEQUENCE_NO_DATA_INDEX = SEQUENCE_DB & (~DATA_INDEX),
 
         ALL               = (size_t) -1,
     };
diff --git a/src/commons/DBWriter.cpp b/src/commons/DBWriter.cpp
index cb19474..1e3eda3 100644
--- a/src/commons/DBWriter.cpp
+++ b/src/commons/DBWriter.cpp
@@ -666,11 +666,11 @@ void DBWriter::writeThreadBuffer(unsigned int idx, size_t dataSize) {
     }
 }
 
-void DBWriter::createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& lookupFile, int sortMode) {
+void DBWriter::createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& origData, const std::string& origIndex, int sortMode) {
     DBReader<unsigned int>* lookupReader = NULL;
     FILE *sLookup = NULL;
-    if (lookupFile.empty() == false) {
-        lookupReader = new DBReader<unsigned int>(lookupFile.c_str(), lookupFile.c_str(), 1, DBReader<unsigned int>::USE_LOOKUP);
+    if (origData.empty() == false && origIndex.empty() == false) {
+        lookupReader = new DBReader<unsigned int>(origData.c_str(), origIndex.c_str(), 1, DBReader<unsigned int>::USE_LOOKUP);
         lookupReader->open(DBReader<unsigned int>::NOSORT);
         sLookup = FileUtil::openAndDelete((dataFile + ".lookup").c_str(), "w");
     }
@@ -697,6 +697,7 @@ void DBWriter::createRenumberedDB(const std::string& dataFile, const std::string
             size_t lookupId = lookupReader->getLookupIdByKey(idx->id);
             DBReader<unsigned int>::LookupEntry copy = lookup[lookupId];
             copy.id = i;
+            copy.entryName = SSTR(idx->id);
             len = lookupReader->lookupEntryToBuffer(buffer, copy);
             written = fwrite(buffer, sizeof(char), len, sLookup);
             if (written != (int) len) {
diff --git a/src/commons/DBWriter.h b/src/commons/DBWriter.h
index 575faa6..b908277 100644
--- a/src/commons/DBWriter.h
+++ b/src/commons/DBWriter.h
@@ -1,12 +1,7 @@
 #ifndef DBWRITER_H
 #define DBWRITER_H
-
-// Written by Martin Steinegger & Maria Hauser mhauser@genzentrum.lmu.de
-//
-// Manages ffindex DB write access.
-// For parallel write access, one ffindex DB per thread is generated.
-// After the parallel calculation is done, all ffindexes are merged into one.
-//
+// For parallel write access, each thread creates its own DB
+// After the parallel calculations are done, all DBs are merged into a single DB
 
 #include <string>
 #include <vector>
@@ -67,7 +62,7 @@ class DBWriter {
     template <typename T>
     static void writeIndexEntryToFile(FILE *outFile, char *buff1, T &index);
 
-    static void createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& lookupFile, int sortMode = DBReader<unsigned int>::SORT_BY_ID_OFFSET);
+    static void createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& origData, const std::string& origIndex, int sortMode = DBReader<unsigned int>::SORT_BY_ID_OFFSET);
 
     bool isClosed(){
         return closed;
diff --git a/src/commons/ExpressionParser.cpp b/src/commons/ExpressionParser.cpp
index b845347..916da64 100644
--- a/src/commons/ExpressionParser.cpp
+++ b/src/commons/ExpressionParser.cpp
@@ -1,10 +1,14 @@
 #include "ExpressionParser.h"
+#include <set>
 #include <cstddef>
 
-ExpressionParser::ExpressionParser(const char* expression) {
+ExpressionParser::ExpressionParser(const char* expression) : ExpressionParser(expression, {}) {
+}
+
+ExpressionParser::ExpressionParser(const char* expression, const std::vector<te_variable>& lookup) {
 #define str2(s) #s
 #define str(s) str2(s)
-#define V(x) { "$" str(x), &variables[x - 1], TE_VARIABLE, NULL},
+#define V(x) { "$" str(x), &variables[((x) - 1)], TE_VARIABLE, NULL},
     vars = {
             V(1) V(2) V(3) V(4) V(5) V(6) V(7) V(8) V(9) V(10) V(11) V(12) V(13) V(14) V(15) V(16)
             V(17) V(18) V(19) V(20) V(21) V(22) V(23) V(24) V(25) V(26) V(27) V(28) V(29) V(30) V(31) V(32)
@@ -18,6 +22,7 @@ ExpressionParser::ExpressionParser(const char* expression) {
 #undef V
 #undef str
 #undef str2
+    vars.insert(vars.begin(), lookup.begin(), lookup.end());
     expr = te_compile(expression, vars.data(), vars.size(), &err);
 }
 
@@ -29,6 +34,8 @@ std::vector<int> ExpressionParser::findBindableIndices() {
     for (size_t i = 0; i < pointers.size(); ++i) {
         indices.emplace_back(pointers[i] - base);
     }
+    std::set<int> s(indices.begin(), indices.end());
+    indices.assign(s.begin(), s.end());
     return indices;
 }
 
diff --git a/src/commons/ExpressionParser.h b/src/commons/ExpressionParser.h
index 8737afe..39f4aad 100644
--- a/src/commons/ExpressionParser.h
+++ b/src/commons/ExpressionParser.h
@@ -7,6 +7,7 @@
 class ExpressionParser {
 public:
     ExpressionParser(const char* expression);
+    ExpressionParser(const char* expression, const std::vector<te_variable>& lookup);
 
     ~ExpressionParser() {
         if (expr) {
diff --git a/src/commons/FileUtil.cpp b/src/commons/FileUtil.cpp
index a83abb7..af81414 100644
--- a/src/commons/FileUtil.cpp
+++ b/src/commons/FileUtil.cpp
@@ -186,18 +186,48 @@ void FileUtil::symlinkAlias(const std::string &file, const std::string &alias) {
     if (symlinkExists(pathToAlias) == true){
         FileUtil::remove(pathToAlias.c_str());
     }
-
-    if (symlinkat(base.c_str(), dirfd(dir), alias.c_str()) != 0) {
+    // symlinkat is not available in Conda macOS
+    // Conda uses the macOS 10.9 SDK, and symlinkat was introduced in 10.10
+    // We emulate symlinkat by manipulating the CWD instead
+    std::string oldWd = FileUtil::getCurrentWorkingDirectory();
+    if (chdir(path.c_str()) != 0) {
+        Debug(Debug::ERROR) << "Could not change working directory to " << path << "\n";
+        EXIT(EXIT_FAILURE);
+    }
+    if (symlink(base.c_str(), alias.c_str()) != 0) {
         Debug(Debug::ERROR) << "Could not create symlink of " << file << "!\n";
         EXIT(EXIT_FAILURE);
     }
-
+    if (chdir(oldWd.c_str()) != 0) {
+        Debug(Debug::ERROR) << "Could not change working directory to " << oldWd << "\n";
+        EXIT(EXIT_FAILURE);
+    }
     if (closedir(dir) != 0) {
         Debug(Debug::ERROR) << "Error closing directory " << path << "!\n";
         EXIT(EXIT_FAILURE);
     }
 }
 
+// Returns the current working directory of the process; terminates via EXIT if it cannot be determined.
+std::string FileUtil::getCurrentWorkingDirectory() {
+    // CWD can be larger than PATH_MAX, so retry with a doubled buffer whenever getcwd reports ERANGE.
+    // The size has to grow on every failed attempt, otherwise the same too-small request repeats forever.
+    char* wd = NULL;
+    size_t bufferSize = PATH_MAX;
+    while ((wd = getcwd(NULL, bufferSize)) == NULL) {
+        // getcwd returned NULL, so errno was set by this very call
+        if (errno != ERANGE) {
+            Debug(Debug::ERROR) << "Could not get current working directory\n";
+            EXIT(EXIT_FAILURE);
+        }
+        bufferSize *= 2;
+    }
+    // getcwd(NULL, ...) hands back an allocated buffer that has to be released with free
+    std::string cwd(wd);
+    free(wd);
+    return cwd;
+}
+
 void FileUtil::symlinkAbs(const std::string &target, const std::string &link) {
     if(FileUtil::fileExists(link.c_str())){
         FileUtil::remove(link.c_str());
diff --git a/src/commons/FileUtil.h b/src/commons/FileUtil.h
index 8237ebf..8962d25 100644
--- a/src/commons/FileUtil.h
+++ b/src/commons/FileUtil.h
@@ -20,7 +20,7 @@ class FileUtil {
 
     static size_t countLines(const char *name);
 
-    static bool makeDir(const char *dirName, const int mode = 0700);
+    static bool makeDir(const char *dirName, const int mode = 0777);
 
     static void deleteTempFiles(const std::list<std::string> &tmpFiles);
 
@@ -40,6 +40,8 @@ class FileUtil {
 
     static size_t getFreeSpace(const char *dir);
 
+    static std::string getCurrentWorkingDirectory();
+
     static void symlinkAlias(const std::string &file, const std::string &alias);
     static void symlinkAbs(const std::string &target, const std::string &link);
 
diff --git a/src/commons/KSeqWrapper.cpp b/src/commons/KSeqWrapper.cpp
index 4407175..28fb982 100644
--- a/src/commons/KSeqWrapper.cpp
+++ b/src/commons/KSeqWrapper.cpp
@@ -19,7 +19,8 @@ bool KSeqFile::ReadEntry() {
     int result = KSEQFILE::kseq_read(s);
     if (result < 0)
         return false;
-    entry.offset = s->offset;
+    entry.headerOffset = s->headerOffset;
+    entry.sequenceOffset = s->sequenceOffset;
     entry.multiline = s->multiline;
     entry.name = s->name;
     entry.comment = s->comment;
@@ -34,6 +35,33 @@ KSeqFile::~KSeqFile() {
     fclose(file);
 }
 
+
+namespace KSEQSTREAM {
+    KSEQ_INIT(int, read) // kseq parser instantiated for an int handle that is read through read()
+}
+
+KSeqStream::KSeqStream() {
+    seq = static_cast<void*>(KSEQSTREAM::kseq_init(STDIN_FILENO)); // parser state bound to standard input
+}
+
+bool KSeqStream::ReadEntry() {
+    KSEQSTREAM::kseq_t* s = (KSEQSTREAM::kseq_t*) seq;
+    if (KSEQSTREAM::kseq_read(s) < 0) // negative result: no entry could be read
+        return false;
+    entry.name = s->name;
+    entry.comment = s->comment;
+    entry.sequence = s->seq;
+    entry.qual = s->qual;
+    entry.headerOffset = -1; // offsets are not reported for a stream; filled like KSeqGzip/KSeqBzip do
+    entry.sequenceOffset = -1;
+    entry.multiline = s->multiline; // previously left unset, unlike every other reader
+    return true;
+}
+
+KSeqStream::~KSeqStream() {
+    KSEQSTREAM::kseq_destroy(static_cast<KSEQSTREAM::kseq_t*>(seq)); // release the parser created in the constructor
+}
+
 #ifdef HAVE_ZLIB
 namespace KSEQGZIP {
     KSEQ_INIT(gzFile, gzread)
@@ -64,7 +92,8 @@ bool KSeqGzip::ReadEntry() {
     entry.comment = s->comment;
     entry.sequence = s->seq;
     entry.qual = s->qual;
-    entry.offset = -1;
+    entry.headerOffset = -1;
+    entry.sequenceOffset = -1;
     entry.multiline = s->multiline;
 
     return true;
@@ -108,7 +137,8 @@ bool KSeqBzip::ReadEntry() {
     entry.comment = s->comment;
     entry.sequence = s->seq;
     entry.qual = s->qual;
-    entry.offset = -1;
+    entry.headerOffset = -1;
+    entry.sequenceOffset = -1;
     entry.multiline = s->multiline;
 
     return true;
@@ -123,12 +153,19 @@ KSeqBzip::~KSeqBzip() {
 
 KSeqWrapper* KSeqFactory(const char* file) {
     KSeqWrapper* kseq = NULL;
+    if( strcmp(file, "stdin") == 0 ){
+        kseq = new KSeqStream();
+        return kseq;
+    }
+
     if(Util::endsWith(".gz", file) == false && Util::endsWith(".bz2", file) == false ) {
         kseq = new KSeqFile(file);
+        return kseq;
     }
 #ifdef HAVE_ZLIB
     else if(Util::endsWith(".gz", file) == true) {
         kseq = new KSeqGzip(file);
+        return kseq;
     }
 #else
     else if(Util::endsWith(".gz", file) == true) {
@@ -140,6 +177,7 @@ KSeqWrapper* KSeqFactory(const char* file) {
 #ifdef HAVE_BZLIB
     else if(Util::endsWith(".bz2", file) == true) {
         kseq = new KSeqBzip(file);
+        return kseq;
     }
 #else
     else if(Util::endsWith(".bz2", file) == true) {
diff --git a/src/commons/KSeqWrapper.h b/src/commons/KSeqWrapper.h
index 7a56525..3e32699 100644
--- a/src/commons/KSeqWrapper.h
+++ b/src/commons/KSeqWrapper.h
@@ -12,7 +12,8 @@ class KSeqWrapper {
         kstring_t sequence;
         kstring_t comment;
         kstring_t qual;
-        size_t offset;
+        size_t headerOffset;
+        size_t sequenceOffset;
         bool multiline;
     } entry;
 
@@ -32,6 +33,14 @@ class KSeqFile : public KSeqWrapper {
     FILE* file;
 };
 
+
+class KSeqStream : public KSeqWrapper { // KSeqWrapper variant whose parser is set up on standard input
+public:
+    KSeqStream();
+    bool ReadEntry(); // returns false when the underlying kseq_read reports a negative result
+    ~KSeqStream();
+};
+
 #ifdef HAVE_ZLIB
 #include <zlib.h>
 
diff --git a/src/commons/MultiParam.cpp b/src/commons/MultiParam.cpp
new file mode 100644
index 0000000..d3b8029
--- /dev/null
+++ b/src/commons/MultiParam.cpp
@@ -0,0 +1,112 @@
+#include "MultiParam.h"
+#include <stdio.h>
+#include <cfloat>
+
+#include "Util.h"
+
+// Builds a parameter pair from separate amino acid and nucleotide values.
+template <typename T>
+MultiParam<T>::MultiParam(T aminoacids, T nucleotides)
+    : aminoacids(aminoacids), nucleotides(nucleotides) {
+}
+
+// Formats the pair as a single value when both agree, otherwise as "nucl:<n>,aa:<a>".
+template <typename T>
+std::string MultiParam<T>::format(const MultiParam<T> &multiparam) {
+    if (multiparam.nucleotides != multiparam.aminoacids) {
+        return std::string("nucl:") + SSTR(multiparam.nucleotides) + ",aa:" + SSTR(multiparam.aminoacids);
+    }
+    return SSTR(multiparam.nucleotides);
+}
+
+// Parses "<v>", "aa:<a>,nucl:<n>" or "nucl:<n>,aa:<a>"; on failure both values are set to INT_MAX.
+template<>
+MultiParam<int>::MultiParam(const char* parametercstring) {
+    bool parsed;
+    if (strchr(parametercstring, ',') == NULL) {
+        parsed = (sscanf(parametercstring, "%d", &aminoacids) == 1);
+        if (parsed) {
+            nucleotides = aminoacids;
+        }
+    } else {
+        parsed = sscanf(parametercstring, "aa:%d,nucl:%d", &aminoacids, &nucleotides) == 2
+              || sscanf(parametercstring, "nucl:%d,aa:%d", &nucleotides, &aminoacids) == 2;
+    }
+    if (parsed == false) {
+        nucleotides = INT_MAX;
+        aminoacids = INT_MAX;
+    }
+}
+
+// Parses "<v>", "aa:<a>,nucl:<n>" or "nucl:<n>,aa:<a>"; on failure both values are set to FLT_MAX.
+template<>
+MultiParam<float>::MultiParam(const char* parametercstring) {
+    bool parsed;
+    if (strchr(parametercstring, ',') == NULL) {
+        parsed = (sscanf(parametercstring, "%f", &aminoacids) == 1);
+        if (parsed) {
+            nucleotides = aminoacids;
+        }
+    } else {
+        parsed = sscanf(parametercstring, "aa:%f,nucl:%f", &aminoacids, &nucleotides) == 2
+              || sscanf(parametercstring, "nucl:%f,aa:%f", &nucleotides, &aminoacids) == 2;
+    }
+    if (parsed == false) {
+        nucleotides = FLT_MAX;
+        aminoacids = FLT_MAX;
+    }
+}
+
+
+template class MultiParam<int>;
+template class MultiParam<float>;
+
+/* explicit implementation for MultiParam<char*> */
+
+MultiParam<char*>::MultiParam(const char*  aminoacids, const char*  nucleotides) {
+    this->aminoacids = strdup(aminoacids); // keep private copies of both strings
+    this->nucleotides = strdup(nucleotides);
+}
+
+MultiParam<char*>::MultiParam(const char* filename) {
+    if (strchr(filename, ',') != NULL) { // "aa:<file>,nucl:<file>" or "nucl:<file>,aa:<file>"
+        size_t len = strlen(filename); // every parsed part (plus terminator) is shorter than the whole input
+        aminoacids = (char*) malloc(len * sizeof(char));
+        nucleotides = (char*) malloc(len * sizeof(char));
+        if (sscanf(filename, "aa:%[^,],nucl:%[^\n]", aminoacids, nucleotides) != 2 && sscanf(filename, "nucl:%[^,],aa:%[^\n]", nucleotides, aminoacids) != 2) { // %[^\n], not %s: %s stops at the first blank and truncated such paths
+            free(nucleotides);
+            free(aminoacids);
+            nucleotides = strdup("INVALID");
+            aminoacids = strdup("INVALID");
+        }
+    } else {
+        nucleotides = strdup(filename); // plain name: the same value is used for both sequence types
+        aminoacids = strdup(filename);
+    }
+}
+
+MultiParam<char*>::~MultiParam() {
+    free(nucleotides); // both strings are heap allocations made by the constructors
+    free(aminoacids);
+}
+
+std::string MultiParam<char*>::format(const MultiParam<char*> &file) {
+    // Always emits the explicit two-part form; collapsing identical values into a
+    // single name (as the numeric MultiParam::format does) is disabled here.
+    std::string formatted("nucl:");
+    formatted.append(file.nucleotides).append(",aa:").append(file.aminoacids);
+    return formatted;
+}
+
+
+bool MultiParam<char*>::operator==(const char* other) const {
+    return strcmp(other, nucleotides) == 0 || strcmp(other, aminoacids) == 0; // equal to either stored value
+}
+// All comparisons are exact: comparing only strlen(stored) characters accepted any longer string with the same prefix.
+bool MultiParam<char*>::operator==(const std::string& other) const {
+    return strcmp(other.c_str(), nucleotides) == 0 || strcmp(other.c_str(), aminoacids) == 0;
+}
+// Two MultiParams are equal only when both the nucleotide and the amino acid value agree.
+bool MultiParam<char*>::operator==(const MultiParam<char*>& other) const {
+    return strcmp(other.nucleotides, nucleotides) == 0 && strcmp(other.aminoacids, aminoacids) == 0;
+}
diff --git a/src/commons/MultiParam.h b/src/commons/MultiParam.h
new file mode 100644
index 0000000..0ed60f5
--- /dev/null
+++ b/src/commons/MultiParam.h
@@ -0,0 +1,89 @@
+#ifndef MULTIPARAM_H
+#define MULTIPARAM_H
+
+/*
+ * MultiParam: class to store sequence type specific parameter values
+ * written by Annika Seidel <annika.seidel@mpibpc.mpg.de>
+ */
+
+#include <string>
+#include <cstring>
+#include <limits.h>
+#include <stdlib.h>
+
+template <class T>
+class MultiParam {
+    // Holds one parameter value per sequence type (amino acid / nucleotide).
+public:
+    T aminoacids;
+    T nucleotides;
+    MultiParam() : aminoacids(), nucleotides() {} // value-initialize: a default-constructed object never holds indeterminate values
+    MultiParam(T aminoacids, T nucleotides);
+    MultiParam(const char* parametercstring); // parses "<v>" or "aa:<a>,nucl:<n>" (either order)
+    ~MultiParam() {}
+    // Renders the pair as text: one value if both agree, otherwise "nucl:<n>,aa:<a>".
+    static std::string format(const MultiParam<T> &multiparam);
+    // Assigns the same value to both sequence types.
+    MultiParam<T>& operator=(T value) {
+        nucleotides = value;
+        aminoacids = value;
+        return *this;
+    }
+};
+
+template <>
+class MultiParam<char*> {
+    // Specialization for string values; both members are heap copies owned by the object.
+public:
+    char* aminoacids;
+    char* nucleotides;
+
+    MultiParam(const char* aminoacids, const char* nucleotides);
+    MultiParam(const char* parametercstring);
+    MultiParam(const MultiParam<char*> & par) {
+        nucleotides = strdup(par.nucleotides);
+        aminoacids = strdup(par.aminoacids);
+    }
+
+    explicit MultiParam(const std::string& parameterstring) : MultiParam(parameterstring.c_str()) {}
+    ~MultiParam();
+
+    static std::string format(const MultiParam<char*> &multiparam);
+    //MultiParam<char>& operator=(char* value);
+
+    //ScoreMatrixFile(const ScoreMatrixFile& copy) : ScoreMatrixFile(copy.aminoacids, copy.nucleotides) {}
+
+    // Copy assignment. The copies are made before the old strings are released and
+    // self-assignment is skipped: freeing first and then strdup'ing from `other`
+    // would read already released memory when `other` is this very object.
+    MultiParam<char*>& operator=(const MultiParam<char*>& other) {
+        if (this != &other) {
+            char* newNucleotides = strdup(other.nucleotides);
+            char* newAminoacids = strdup(other.aminoacids);
+            free(nucleotides);
+            free(aminoacids);
+            nucleotides = newNucleotides;
+            aminoacids = newAminoacids;
+        }
+        return *this;
+    }
+
+    bool operator==(const char* other) const;
+    bool operator==(const std::string& other) const;
+    bool operator==(const MultiParam<char*>& other) const;
+
+    bool operator!=(const char* other) const {
+        return !(operator==(other));
+    }
+    bool operator!=(const std::string& other) const {
+        return !(operator==(other));
+    }
+    bool operator!=(const MultiParam<char*>& other) const {
+        return !(operator==(other));
+    }
+
+
+};
+
+
+#endif
diff --git a/src/commons/NucleotideMatrix.cpp b/src/commons/NucleotideMatrix.cpp
index a4f9220..ca44818 100644
--- a/src/commons/NucleotideMatrix.cpp
+++ b/src/commons/NucleotideMatrix.cpp
@@ -6,11 +6,11 @@ NucleotideMatrix::NucleotideMatrix(const char* scoringMatrixFileName, float bitF
     setupLetterMapping();
     reverseLookup = new int[alphabetSize];
     // TODO think about making the matrix dynamic
-    reverseLookup[aa2int[static_cast<int>('A')]] = aa2int[static_cast<int>('T')];
-    reverseLookup[aa2int[static_cast<int>('G')]] = aa2int[static_cast<int>('C')];
-    reverseLookup[aa2int[static_cast<int>('C')]] = aa2int[static_cast<int>('G')];
-    reverseLookup[aa2int[static_cast<int>('T')]] = aa2int[static_cast<int>('A')];
-    reverseLookup[aa2int[static_cast<int>('X')]] = aa2int[static_cast<int>('X')];
+    reverseLookup[aa2num[static_cast<int>('A')]] = aa2num[static_cast<int>('T')];
+    reverseLookup[aa2num[static_cast<int>('G')]] = aa2num[static_cast<int>('C')];
+    reverseLookup[aa2num[static_cast<int>('C')]] = aa2num[static_cast<int>('G')];
+    reverseLookup[aa2num[static_cast<int>('T')]] = aa2num[static_cast<int>('A')];
+    reverseLookup[aa2num[static_cast<int>('X')]] = aa2num[static_cast<int>('X')];
 }
 
 
@@ -34,11 +34,11 @@ void NucleotideMatrix::setupLetterMapping(){
             case 'T':
             case 'G':
             case 'C':
-                this->aa2int[static_cast<int>(letter)] = this->aa2int[static_cast<int>(upperLetter)];
+                this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>(upperLetter)];
                 break;
             case 'U':
             case 'W':
-                this->aa2int[static_cast<int>(letter)] = this->aa2int[static_cast<int>('T')];
+                this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>('T')];
                 break;
             case 'K':
             case 'B':
@@ -46,15 +46,15 @@ void NucleotideMatrix::setupLetterMapping(){
             case 'V':
             case 'R':
             case 'S':
-                this->aa2int[static_cast<int>(letter)] = this->aa2int[static_cast<int>('G')];
+                this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>('G')];
                 break;
             case 'M':
             case 'Y':
             case 'H':
-                this->aa2int[static_cast<int>(letter)] = this->aa2int[static_cast<int>('C')];
+                this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>('C')];
                 break;
             default:
-                this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'X'];
+                this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>('X')];
                 break;
         }
     }
diff --git a/src/commons/Orf.cpp b/src/commons/Orf.cpp
index 1a9fc44..6a57851 100644
--- a/src/commons/Orf.cpp
+++ b/src/commons/Orf.cpp
@@ -202,23 +202,19 @@ template <int N>
 #ifndef AVX2
 inline bool isInCodons(const char* sequence, simd_int codons, simd_int codons2) {
 #else
-    inline bool isInCodons(const char* sequence, simd_int codons, simd_int) {
+inline bool isInCodons(const char* sequence, simd_int codons, simd_int) {
 #endif
-    simd_int c = simdi_loadu((simd_int*)sequence);
-    // ATGA ATGA ATGA ATGA
-#ifdef AVX2
-    simd_int shuf = _mm256_permutevar8x32_epi32(c, _mm256_setzero_si256());
-#else
-    simd_int shuf = simdi32_shuffle(c, _MM_SHUFFLE(0, 0, 0, 0));
-#endif
-    // ATG0 ATG0 ATG0 ATG0
+    // s: ATGA GTGA TGAT GAGT
+    // c: ATGA ATGA ATGA ATGA
+    simd_int c = simdi32_set(*(unsigned int*)sequence);
     simd_int mask = simdi32_set(0x00FFFFFF);
-    shuf = simdi_and(mask, shuf);
-    // FFFF 0000 0000 0000
-    simd_int test = simdi32_eq(shuf, codons);
+    // c: ATG0 ATG0 ATG0 ATG0
+    c = simdi_and(mask, c);
+    // t: FFFF 0000 0000 0000
+    simd_int test = simdi32_eq(c, codons);
 #ifndef AVX2
-    if(N > 4) {
-        simd_int test2 = simdi32_eq(shuf, codons2);
+    if (N > 4) {
+        simd_int test2 = simdi32_eq(c, codons2);
         return (simdi8_movemask(test) != 0) && (simdi8_movemask(test2) != 0);
     }
 #endif
diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
index 2f1d0d0..d4336a2 100644
--- a/src/commons/Parameters.cpp
+++ b/src/commons/Parameters.cpp
@@ -27,234 +27,237 @@ extern const char* version;
 Parameters::Parameters():
         scoringMatrixFile("INVALID", "INVALID"),
         seedScoringMatrixFile("INVALID", "INVALID"),
-        PARAM_S(PARAM_S_ID,"-s", "Sensitivity","sensitivity: 1.0 faster; 4.0 fast default; 7.5 sensitive (range 1.0-7.5)", typeid(float), (void *) &sensitivity, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PREFILTER),
-        PARAM_K(PARAM_K_ID,"-k", "K-mer size", "k-mer size in the range (0: set automatically to optimum)",typeid(int),  (void *) &kmerSize, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_THREADS(PARAM_THREADS_ID,"--threads", "Threads", "number of cores used for the computation (uses all cores by default)",typeid(int), (void *) &threads, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_COMMON),
-        PARAM_COMPRESSED(PARAM_COMPRESSED_ID,"--compressed", "Compressed", "write results in compressed format",typeid(int), (void *) &compressed, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON),
-        PARAM_ALPH_SIZE(PARAM_ALPH_SIZE_ID,"--alph-size", "Alphabet size", "alphabet size (range 2-21)",typeid(int),(void *) &alphabetSize, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
-        // Regex for Range 1-32768
-        // Please do not change manually, use a tool to regenerate
-        // e.g.: http://gamon.webfactional.com/regexnumericrangegenerator/
-        PARAM_MAX_SEQ_LEN(PARAM_MAX_SEQ_LEN_ID,"--max-seq-len","Max sequence length", "maximum sequence length (range 1-32768])",typeid(int), (void *) &maxSeqLen, "^[0-9]{1}[0-9]*", MMseqsParameter::COMMAND_COMMON|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_DIAGONAL_SCORING(PARAM_DIAGONAL_SCORING_ID,"--diag-score", "Diagonal scoring", "Use ungapped diagonal scoring during prefilter", typeid(bool), (void *) &diagonalScoring, "", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_EXACT_KMER_MATCHING(PARAM_EXACT_KMER_MATCHING_ID,"--exact-kmer-matching", "Exact k-mer matching", "only exact k-mer matching (range 0-1)", typeid(int),(void *) &exactKmerMatching, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_MASK_RESIDUES(PARAM_MASK_RESIDUES_ID,"--mask", "Mask residues", "mask sequences in k-mer stage 0: w/o low complexity masking, 1: with low complexity masking", typeid(int),(void *) &maskMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_MASK_LOWER_CASE(PARAM_MASK_LOWER_CASE_ID,"--mask-lower-case", "Mask lower case residues", "lowercase letters will be excluded from k-mer search 0: include region, 1: exclude region", typeid(int),(void *) &maskLowerCaseMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_MIN_DIAG_SCORE(PARAM_MIN_DIAG_SCORE_ID,"--min-ungapped-score", "Minimum diagonal score", "accept only matches with ungapped alignment score above this threshold", typeid(int),(void *) &minDiagScoreThr, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_K_SCORE(PARAM_K_SCORE_ID,"--k-score", "K-score", "K-mer threshold for generating similar k-mer lists",typeid(int),(void *) &kmerScore,  "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_MAX_SEQS(PARAM_MAX_SEQS_ID,"--max-seqs", "Max results per query", "Maximum result sequences per query allowed to pass the prefilter (this parameter affects sensitivity)",typeid(int),(void *) &maxResListLen, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER),
-        PARAM_SPLIT(PARAM_SPLIT_ID,"--split", "Split database", "Splits input sets into N equally distributed chunks. The default value sets the best split automatically. createindex can only be used with split 1.",typeid(int),(void *) &split,  "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_SPLIT_MODE(PARAM_SPLIT_MODE_ID,"--split-mode", "Split mode", "0: split target db; 1: split query db;  2: auto, depending on main memory",typeid(int),(void *) &splitMode,  "^[0-2]{1}$", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_SPLIT_MEMORY_LIMIT(PARAM_SPLIT_MEMORY_LIMIT_ID, "--split-memory-limit", "Split memory limit", "Set max memory per split. E.g. 800B, 5K, 10M, 1G. Defaults (0) to all available system memory.", typeid(ByteParser), (void*) &splitMemoryLimit, "^(0|[1-9]{1}[0-9]*(B|K|M|G|T)?)$", MMseqsParameter::COMMAND_COMMON|MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_DISK_SPACE_LIMIT(PARAM_DISK_SPACE_LIMIT_ID, "--disk-space-limit", "Disk space limit", "Set max disk space to use for reverse profile searches. E.g. 800B, 5K, 10M, 1G. Defaults (0) to all available disk space in the temp folder.", typeid(ByteParser), (void*) &diskSpaceLimit, "^(0|[1-9]{1}[0-9]*(B|K|M|G|T)?)$", MMseqsParameter::COMMAND_COMMON|MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_SPLIT_AMINOACID(PARAM_SPLIT_AMINOACID_ID,"--split-aa", "Split by amino acid","Try to find the best split for the target database by amino acid count instead",typeid(bool), (void *) &splitAA, "$", MMseqsParameter::COMMAND_EXPERT),
-        PARAM_SUB_MAT(PARAM_SUB_MAT_ID,"--sub-mat", "Substitution matrix", "amino acid substitution matrix file",typeid(ScoreMatrixFile),(void *) &scoringMatrixFile, "", MMseqsParameter::COMMAND_COMMON|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_SEED_SUB_MAT(PARAM_SEED_SUB_MAT_ID,"--seed-sub-mat", "Seed substitution matrix", "amino acid substitution matrix for kmer generation file",typeid(ScoreMatrixFile),(void *) &seedScoringMatrixFile, "", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_NO_COMP_BIAS_CORR(PARAM_NO_COMP_BIAS_CORR_ID,"--comp-bias-corr", "Compositional bias","correct for locally biased amino acid composition (range 0-1)",typeid(int), (void *) &compBiasCorrection, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_SPACED_KMER_MODE(PARAM_SPACED_KMER_MODE_ID,"--spaced-kmer-mode", "Spaced k-mers", "0: use consecutive positions a k-mers; 1: use spaced k-mers",typeid(int), (void *) &spacedKmer,  "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_REMOVE_TMP_FILES(PARAM_REMOVE_TMP_FILES_ID, "--remove-tmp-files", "Remove temporary files" , "Delete temporary files", typeid(bool), (void *) &removeTmpFiles, "",MMseqsParameter::COMMAND_MISC|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_INCLUDE_IDENTITY(PARAM_INCLUDE_IDENTITY_ID,"--add-self-matches", "Include identical seq. id.","artificially add entries of queries with themselves (for clustering)",typeid(bool), (void *) &includeIdentity, "", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_PRELOAD_MODE(PARAM_PRELOAD_MODE_ID, "--db-load-mode", "Preload mode", "Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch", typeid(int), (void*) &preloadMode, "[0-3]{1}", MMseqsParameter::COMMAND_COMMON|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_SPACED_KMER_PATTERN(PARAM_SPACED_KMER_PATTERN_ID, "--spaced-kmer-pattern", "Spaced k-mer pattern", "User-specified spaced k-mer pattern", typeid(std::string), (void *) &spacedKmerPattern, "^1[01]*1$", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_LOCAL_TMP(PARAM_LOCAL_TMP_ID, "--local-tmp", "Local temporary path", "Path where some of the temporary files will be created", typeid(std::string), (void *) &localTmp, "", MMseqsParameter::COMMAND_PREFILTER|MMseqsParameter::COMMAND_EXPERT),
+        alphabetSize(INT_MAX,INT_MAX),
+        PARAM_S(PARAM_S_ID, "-s", "Sensitivity", "Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive", typeid(float), (void *) &sensitivity, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PREFILTER),
+        PARAM_K(PARAM_K_ID, "-k", "k-mer length", "k-mer length (0: automatically set to optimum)", typeid(int), (void *) &kmerSize, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_THREADS(PARAM_THREADS_ID, "--threads", "Threads", "Number of CPU-cores used (all by default)", typeid(int), (void *) &threads, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_COMMON),
+        PARAM_COMPRESSED(PARAM_COMPRESSED_ID, "--compressed", "Compressed", "Write compressed output", typeid(int), (void *) &compressed, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON),
+        PARAM_ALPH_SIZE(PARAM_ALPH_SIZE_ID, "--alph-size", "Alphabet size", "Alphabet size (range 2-21)", typeid(MultiParam<int>), (void *) &alphabetSize, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_MAX_SEQ_LEN(PARAM_MAX_SEQ_LEN_ID, "--max-seq-len", "Max sequence length", "Maximum sequence length", typeid(int), (void *) &maxSeqLen, "^[0-9]{1}[0-9]*", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_DIAGONAL_SCORING(PARAM_DIAGONAL_SCORING_ID, "--diag-score", "Diagonal scoring", "Use ungapped diagonal scoring during prefilter", typeid(bool), (void *) &diagonalScoring, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_EXACT_KMER_MATCHING(PARAM_EXACT_KMER_MATCHING_ID, "--exact-kmer-matching", "Exact k-mer matching", "Extract only exact k-mers for matching (range 0-1)", typeid(int), (void *) &exactKmerMatching, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_MASK_RESIDUES(PARAM_MASK_RESIDUES_ID, "--mask", "Mask residues", "Mask sequences in k-mer stage: 0: w/o low complexity masking, 1: with low complexity masking", typeid(int), (void *) &maskMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_MASK_LOWER_CASE(PARAM_MASK_LOWER_CASE_ID, "--mask-lower-case", "Mask lower case residues", "Lowercase letters will be excluded from k-mer search 0: include region, 1: exclude region", typeid(int), (void *) &maskLowerCaseMode, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_MIN_DIAG_SCORE(PARAM_MIN_DIAG_SCORE_ID, "--min-ungapped-score", "Minimum diagonal score", "Accept only matches with ungapped alignment score above threshold", typeid(int), (void *) &minDiagScoreThr, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_K_SCORE(PARAM_K_SCORE_ID, "--k-score", "k-score", "k-mer threshold for generating similar k-mer lists", typeid(int), (void *) &kmerScore, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_MAX_SEQS(PARAM_MAX_SEQS_ID, "--max-seqs", "Max results per query", "Maximum results per query sequence allowed to pass the prefilter (affects sensitivity)", typeid(int), (void *) &maxResListLen, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER),
+        PARAM_SPLIT(PARAM_SPLIT_ID, "--split", "Split database", "Split input into N equally distributed chunks. 0: set the best split automatically", typeid(int), (void *) &split, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_SPLIT_MODE(PARAM_SPLIT_MODE_ID, "--split-mode", "Split mode", "0: split target db; 1: split query db; 2: auto, depending on main memory", typeid(int), (void *) &splitMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_SPLIT_MEMORY_LIMIT(PARAM_SPLIT_MEMORY_LIMIT_ID, "--split-memory-limit", "Split memory limit", "Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory", typeid(ByteParser), (void *) &splitMemoryLimit, "^(0|[1-9]{1}[0-9]*(B|K|M|G|T)?)$", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_DISK_SPACE_LIMIT(PARAM_DISK_SPACE_LIMIT_ID, "--disk-space-limit", "Disk space limit", "Set max disk space to use for reverse profile searches. E.g. 800B, 5K, 10M, 1G. Default (0) to all available disk space in the temp folder", typeid(ByteParser), (void *) &diskSpaceLimit, "^(0|[1-9]{1}[0-9]*(B|K|M|G|T)?)$", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_SPLIT_AMINOACID(PARAM_SPLIT_AMINOACID_ID, "--split-aa", "Split by amino acid", "Try to find the best split boundaries by entry lengths", typeid(bool), (void *) &splitAA, "$", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_SUB_MAT(PARAM_SUB_MAT_ID, "--sub-mat", "Substitution matrix", "Substitution matrix file", typeid(MultiParam<char*>), (void *) &scoringMatrixFile, "", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_SEED_SUB_MAT(PARAM_SEED_SUB_MAT_ID, "--seed-sub-mat", "Seed substitution matrix", "Substitution matrix file for k-mer generation", typeid(MultiParam<char*>), (void *) &seedScoringMatrixFile, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_NO_COMP_BIAS_CORR(PARAM_NO_COMP_BIAS_CORR_ID, "--comp-bias-corr", "Compositional bias", "Correct for locally biased amino acid composition (range 0-1)", typeid(int), (void *) &compBiasCorrection, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_SPACED_KMER_MODE(PARAM_SPACED_KMER_MODE_ID, "--spaced-kmer-mode", "Spaced k-mers", "0: use consecutive positions in k-mers; 1: use spaced k-mers", typeid(int), (void *) &spacedKmer, "^[0-1]{1}", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_REMOVE_TMP_FILES(PARAM_REMOVE_TMP_FILES_ID, "--remove-tmp-files", "Remove temporary files", "Delete temporary files", typeid(bool), (void *) &removeTmpFiles, "", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_INCLUDE_IDENTITY(PARAM_INCLUDE_IDENTITY_ID, "--add-self-matches", "Include identical seq. id.", "Artificially add entries of queries with themselves (for clustering)", typeid(bool), (void *) &includeIdentity, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_PRELOAD_MODE(PARAM_PRELOAD_MODE_ID, "--db-load-mode", "Preload mode", "Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch", typeid(int), (void *) &preloadMode, "[0-3]{1}", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_SPACED_KMER_PATTERN(PARAM_SPACED_KMER_PATTERN_ID, "--spaced-kmer-pattern", "Spaced k-mer pattern", "User-specified spaced k-mer pattern", typeid(std::string), (void *) &spacedKmerPattern, "^1[01]*1$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_LOCAL_TMP(PARAM_LOCAL_TMP_ID, "--local-tmp", "Local temporary path", "Path where some of the temporary files will be created", typeid(std::string), (void *) &localTmp, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_EXPERT),
         // alignment
-        PARAM_ALIGNMENT_MODE(PARAM_ALIGNMENT_MODE_ID,"--alignment-mode", "Alignment mode", "How to compute the alignment: 0: automatic; 1: only score and end_pos; 2: also start_pos and cov; 3: also seq.id; 4: only ungapped alignment",typeid(int), (void *) &alignmentMode, "^[0-4]{1}$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_E(PARAM_E_ID,"-e", "E-value threshold", "list matches below this E-value (range 0.0-inf)",typeid(float), (void *) &evalThr, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN),
-        PARAM_C(PARAM_C_ID,"-c", "Coverage threshold", "list matches above this fraction of aligned (covered) residues (see --cov-mode)",typeid(float), (void *) &covThr, "^0(\\.[0-9]+)?|^1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN| MMseqsParameter::COMMAND_CLUSTLINEAR),
-        PARAM_COV_MODE(PARAM_COV_MODE_ID, "--cov-mode", "Coverage mode", "0: coverage of query and target, 1: coverage of target, 2: coverage of query 3: target seq. length needs to be at least x% of query length, 4: query seq. length needs to be at least x% of target length 5: short seq. needs to be at least x% of the other seq. length", typeid(int), (void *) &covMode, "^[0-5]{1}$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_ALIGNMENT_MODE(PARAM_ALIGNMENT_MODE_ID, "--alignment-mode", "Alignment mode", "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id\n4: only ungapped alignment", typeid(int), (void *) &alignmentMode, "^[0-4]{1}$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_E(PARAM_E_ID, "-e", "E-value threshold", "List matches below this E-value (range 0.0-inf)", typeid(float), (void *) &evalThr, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_C(PARAM_C_ID, "-c", "Coverage threshold", "List matches above this fraction of aligned (covered) residues (see --cov-mode)", typeid(float), (void *) &covThr, "^0(\\.[0-9]+)?|^1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_CLUSTLINEAR),
+        PARAM_COV_MODE(PARAM_COV_MODE_ID, "--cov-mode", "Coverage mode", "0: coverage of query and target\n1: coverage of target\n2: coverage of query\n3: target seq. length has to be at least x% of query length\n4: query seq. length has to be at least x% of target length\n5: short seq. needs to be at least x% of the other seq. length", typeid(int), (void *) &covMode, "^[0-5]{1}$", MMseqsParameter::COMMAND_ALIGN),
         PARAM_SEQ_ID_MODE(PARAM_SEQ_ID_MODE_ID, "--seq-id-mode", "Seq. id. mode", "0: alignment length 1: shorter, 2: longer sequence", typeid(int), (void *) &seqIdMode, "^[0-2]{1}$", MMseqsParameter::COMMAND_ALIGN),
-        PARAM_MAX_REJECTED(PARAM_MAX_REJECTED_ID,"--max-rejected", "Max reject", "maximum rejected alignments before alignment calculation for a query is aborted",typeid(int),(void *) &maxRejected, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN),
-        PARAM_MAX_ACCEPT(PARAM_MAX_ACCEPT_ID,"--max-accept", "Max accept", "maximum accepted alignments before alignment calculation for a query is stopped",typeid(int),(void *) &maxAccept, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN),
-        PARAM_ADD_BACKTRACE(PARAM_ADD_BACKTRACE_ID, "-a", "Add backtrace", "add backtrace string (convert to alignments with mmseqs convertalis utility)", typeid(bool), (void *) &addBacktrace, "", MMseqsParameter::COMMAND_ALIGN),
-        PARAM_REALIGN(PARAM_REALIGN_ID, "--realign", "Realign hits", "compute more conservative, shorter alignments (scores and E-values not changed)", typeid(bool), (void *) &realign, "", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_MIN_SEQ_ID(PARAM_MIN_SEQ_ID_ID,"--min-seq-id", "Seq. id. threshold","list matches above this sequence identity (for clustering) (range 0.0-1.0)",typeid(float), (void *) &seqIdThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
-        PARAM_MIN_ALN_LEN(PARAM_MIN_ALN_LEN_ID,"--min-aln-len", "Min. alignment length","minimum alignment length (range 0-INT_MAX)",typeid(int), (void *) &alnLenThr, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN),
-        PARAM_SCORE_BIAS(PARAM_SCORE_BIAS_ID,"--score-bias", "Score bias", "Score bias when computing the SW alignment (in bits)",typeid(float), (void *) &scoreBias, "^-?[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_ALT_ALIGNMENT(PARAM_ALT_ALIGNMENT_ID,"--alt-ali", "Alternative alignments","Show up to this many alternative alignments",typeid(int), (void *) &altAlignment, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN),
-        PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID,"--gap-open", "Gap open cost","Gap open cost",typeid(int), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID,"--gap-extend", "Gap extension cost","Gap extension cost",typeid(int), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_EXPERT),
+        PARAM_MAX_REJECTED(PARAM_MAX_REJECTED_ID, "--max-rejected", "Max reject", "Maximum rejected alignments before alignment calculation for a query is stopped", typeid(int), (void *) &maxRejected, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_MAX_ACCEPT(PARAM_MAX_ACCEPT_ID, "--max-accept", "Max accept", "Maximum accepted alignments before alignment calculation for a query is stopped", typeid(int), (void *) &maxAccept, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_ADD_BACKTRACE(PARAM_ADD_BACKTRACE_ID, "-a", "Add backtrace", "Add backtrace string (convert to alignments with mmseqs convertalis module)", typeid(bool), (void *) &addBacktrace, "", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_REALIGN(PARAM_REALIGN_ID, "--realign", "Realign hits", "Compute more conservative, shorter alignments (scores and E-values not changed)", typeid(bool), (void *) &realign, "", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_MIN_SEQ_ID(PARAM_MIN_SEQ_ID_ID, "--min-seq-id", "Seq. id. threshold", "List matches above this sequence identity (for clustering) (range 0.0-1.0)", typeid(float), (void *) &seqIdThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_MIN_ALN_LEN(PARAM_MIN_ALN_LEN_ID, "--min-aln-len", "Min alignment length", "Minimum alignment length (range 0-INT_MAX)", typeid(int), (void *) &alnLenThr, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_SCORE_BIAS(PARAM_SCORE_BIAS_ID, "--score-bias", "Score bias", "Score bias when computing SW alignment (in bits)", typeid(float), (void *) &scoreBias, "^-?[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_ALT_ALIGNMENT(PARAM_ALT_ALIGNMENT_ID, "--alt-ali", "Alternative alignments", "Show up to this many alternative alignments", typeid(int), (void *) &altAlignment, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN),
+        PARAM_GAP_OPEN(PARAM_GAP_OPEN_ID, "--gap-open", "Gap open cost", "Gap open cost", typeid(MultiParam<int>), (void *) &gapOpen, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_GAP_EXTEND(PARAM_GAP_EXTEND_ID, "--gap-extend", "Gap extension cost", "Gap extension cost", typeid(MultiParam<int>), (void *) &gapExtend, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_ZDROP(PARAM_ZDROP_ID, "--zdrop", "Zdrop", "Maximal allowed difference between score values before alignment is truncated  (nucleotide alignment only)", typeid(int), (void*) &zdrop, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
         // clustering
-        PARAM_CLUSTER_MODE(PARAM_CLUSTER_MODE_ID,"--cluster-mode", "Cluster mode", "0: Setcover, 1: connected component, 2: Greedy clustering by sequence length  3: Greedy clustering by sequence length (low mem)",typeid(int), (void *) &clusteringMode, "[0-3]{1}$", MMseqsParameter::COMMAND_CLUST),
-        PARAM_CLUSTER_STEPS(PARAM_CLUSTER_STEPS_ID,"--cluster-steps", "Cascaded clustering steps", "cascaded clustering steps from 1 to -s",typeid(int), (void *) &clusterSteps, "^[1-9]{1}$", MMseqsParameter::COMMAND_CLUST|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_CASCADED(PARAM_CASCADED_ID,"--single-step-clustering", "Single step clustering", "switches from cascaded to simple clustering workflow",typeid(bool), (void *) &cascaded, "", MMseqsParameter::COMMAND_CLUST),
-        PARAM_CLUSTER_REASSIGN(PARAM_CLUSTER_REASSIGN_ID,"--cluster-reassign", "Cluster reassign", "cascaded clustering can cluster sequence that not fulfill the clustering criteria. Cluster reassignment corrects this errors", typeid(int), (void *) &clusterReassignment, "[0-1]{1}$", MMseqsParameter::COMMAND_CLUST),
+        PARAM_CLUSTER_MODE(PARAM_CLUSTER_MODE_ID, "--cluster-mode", "Cluster mode", "0: Set-Cover (greedy)\n1: Connected component (BLASTclust)\n2,3: Greedy clustering by sequence length (CDHIT)", typeid(int), (void *) &clusteringMode, "[0-3]{1}$", MMseqsParameter::COMMAND_CLUST),
+        PARAM_CLUSTER_STEPS(PARAM_CLUSTER_STEPS_ID, "--cluster-steps", "Cascaded clustering steps", "Cascaded clustering steps from 1 to -s", typeid(int), (void *) &clusterSteps, "^[1-9]{1}$", MMseqsParameter::COMMAND_CLUST | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_CASCADED(PARAM_CASCADED_ID, "--single-step-clustering", "Single step clustering", "Switch from cascaded to simple clustering workflow", typeid(bool), (void *) &singleStepClustering, "", MMseqsParameter::COMMAND_CLUST),
+        PARAM_CLUSTER_REASSIGN(PARAM_CLUSTER_REASSIGN_ID, "--cluster-reassign", "Cluster reassign", "Cascaded clustering can cluster sequence that do not fulfill the clustering criteria.\nCluster reassignment corrects these errors", typeid(int), (void *) &clusterReassignment, "[0-1]{1}$", MMseqsParameter::COMMAND_CLUST),
         // affinity clustering
-        PARAM_MAXITERATIONS(PARAM_MAXITERATIONS_ID,"--max-iterations", "Max depth connected component", "maximum depth of breadth first search in connected component",typeid(int), (void *) &maxIteration,  "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUST|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_SIMILARITYSCORE(PARAM_SIMILARITYSCORE_ID,"--similarity-type", "Similarity type", "type of score used for clustering (range 1,2). 1=alignment score. 2=sequence identity ",typeid(int),(void *) &similarityScoreType,  "^[1-2]{1}$", MMseqsParameter::COMMAND_CLUST|MMseqsParameter::COMMAND_EXPERT),
+        PARAM_MAXITERATIONS(PARAM_MAXITERATIONS_ID, "--max-iterations", "Max connected component depth", "Maximum depth of breadth first search in connected component clustering", typeid(int), (void *) &maxIteration, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUST | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_SIMILARITYSCORE(PARAM_SIMILARITYSCORE_ID, "--similarity-type", "Similarity type", "Type of score used for clustering. 1: alignment score 2: sequence identity", typeid(int), (void *) &similarityScoreType, "^[1-2]{1}$", MMseqsParameter::COMMAND_CLUST | MMseqsParameter::COMMAND_EXPERT),
         // logging
-        PARAM_V(PARAM_V_ID,"-v", "Verbosity","verbosity level: 0=nothing, 1: +errors, 2: +warnings, 3: +info",typeid(int), (void *) &verbosity, "^[0-3]{1}$", MMseqsParameter::COMMAND_COMMON),
-        // create profile (HMM)
-        PARAM_PROFILE_TYPE(PARAM_PROFILE_TYPE_ID,"--profile-type", "Profile type", "0: HMM (HHsuite) 1: PSSM or 2: HMMER3",typeid(int),(void *) &profileMode,  "^[0-2]{1}$"),
+        PARAM_V(PARAM_V_ID, "-v", "Verbosity", "Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info", typeid(int), (void *) &verbosity, "^[0-3]{1}$", MMseqsParameter::COMMAND_COMMON),
         // convertalignments
-        PARAM_FORMAT_MODE(PARAM_FORMAT_MODE_ID,"--format-mode", "Alignment format", "Output format 0: BLAST-TAB, 1: SAM, 2: BLAST-TAB + query/db length", typeid(int), (void*) &formatAlignmentMode, "^[0-2]{1}$"),
-        PARAM_FORMAT_OUTPUT(PARAM_FORMAT_OUTPUT_ID,"--format-output", "Format alignment output", "Choose output columns 'query,target,evalue,gapopen,pident,nident,qstart,qend,qlen,tstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,qframe,tframe,mismatch,qcov,tcov,qset,qsetid,tset,tsetid,taxid,taxname,taxlineage'", typeid(std::string), (void*) &outfmt, ""),
-        PARAM_DB_OUTPUT(PARAM_DB_OUTPUT_ID, "--db-output", "Database output", "Output a result db instead of a text file", typeid(bool), (void*) &dbOut, "", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_FORMAT_MODE(PARAM_FORMAT_MODE_ID, "--format-mode", "Alignment format", "Output format: 0: BLAST-TAB, 1: SAM, 2: BLAST-TAB + query/db length", typeid(int), (void *) &formatAlignmentMode, "^[0-2]{1}$"),
+        PARAM_FORMAT_OUTPUT(PARAM_FORMAT_OUTPUT_ID, "--format-output", "Format alignment output", "Choose comma separated list of output columns from: query,target,evalue,gapopen,pident,nident,qstart,qend,qlen\ntstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,qframe,tframe,mismatch,qcov,tcov\nqset,qsetid,tset,tsetid,taxid,taxname,taxlineage", typeid(std::string), (void *) &outfmt, ""),
+        PARAM_DB_OUTPUT(PARAM_DB_OUTPUT_ID, "--db-output", "Database output", "Return a result DB instead of a text file", typeid(bool), (void *) &dbOut, "", MMseqsParameter::COMMAND_EXPERT),
         // --include-only-extendablediagonal
-        PARAM_RESCORE_MODE(PARAM_RESCORE_MODE_ID,"--rescore-mode", "Rescore mode", "Rescore diagonal with: 0: Hamming distance, 1: local alignment (score only), 2: local alignment, 3: global alignment or 4: longest alignment fullfilling window quality criterion", typeid(int), (void *) &rescoreMode, "^[0-4]{1}$"),
-        PARAM_WRAPPED_SCORING(PARAM_WRAPPED_SCORING_ID,"--wrapped-scoring", "Allow wrapped scoring","Double the (nucleotide) query sequence during the scoring process to allow wrapped diagonal scoring around end and start", typeid(bool), (void *) &wrappedScoring, "", MMseqsParameter::COMMAND_MISC|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_FILTER_HITS(PARAM_FILTER_HITS_ID,"--filter-hits", "Remove hits by seq. id. and coverage", "filter hits by seq.id. and coverage", typeid(bool), (void *) &filterHits, "", MMseqsParameter::COMMAND_EXPERT),
-        PARAM_SORT_RESULTS(PARAM_SORT_RESULTS_ID, "--sort-results", "Sort results", "Sort results: 0: no sorting, 1: sort by evalue (Alignment) or seq.id. (Hamming)", typeid(int), (void *) &sortResults, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_RESCORE_MODE(PARAM_RESCORE_MODE_ID, "--rescore-mode", "Rescore mode", "Rescore diagonals with:\n0: Hamming distance\n1: local alignment (score only)\n2: local alignment\n3: global alignment\n4: longest alignment fullfilling window quality criterion", typeid(int), (void *) &rescoreMode, "^[0-4]{1}$"),
+        PARAM_WRAPPED_SCORING(PARAM_WRAPPED_SCORING_ID, "--wrapped-scoring", "Allow wrapped scoring", "Double the (nucleotide) query sequence during the scoring process to allow wrapped diagonal scoring around end and start", typeid(bool), (void *) &wrappedScoring, "", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_FILTER_HITS(PARAM_FILTER_HITS_ID, "--filter-hits", "Remove hits by seq. id. and coverage", "Filter hits by seq.id. and coverage", typeid(bool), (void *) &filterHits, "", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_SORT_RESULTS(PARAM_SORT_RESULTS_ID, "--sort-results", "Sort results", "Sort results: 0: no sorting, 1: sort by E-value (Alignment) or seq.id. (Hamming)", typeid(int), (void *) &sortResults, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
         // result2msa
-        PARAM_ALLOW_DELETION(PARAM_ALLOW_DELETION_ID,"--allow-deletion", "Allow deletions", "allow deletions in a MSA", typeid(bool), (void*) &allowDeletion, ""),
-        PARAM_ADD_INTERNAL_ID(PARAM_ADD_INTERNAL_ID_ID,"--add-iternal-id", "Add internal id.", "add internal id as comment to MSA", typeid(bool), (void*) &addInternalId, "",  MMseqsParameter::COMMAND_EXPERT),
-        PARAM_COMPRESS_MSA(PARAM_COMPRESS_MSA_ID,"--compress", "Compress MSA", "create MSA in ca3m format", typeid(bool), (void*) &compressMSA, ""),
-        PARAM_SUMMARIZE_HEADER(PARAM_SUMMARIZE_HEADER_ID,"--summarize", "Summarize headers", "summarize cluster headers into a single header description", typeid(bool), (void*) &summarizeHeader, ""),
-        PARAM_SUMMARY_PREFIX(PARAM_SUMMARY_PREFIX_ID, "--summary-prefix", "Summary prefix","sets the cluster summary prefix",typeid(std::string),(void *) &summaryPrefix, "", MMseqsParameter::COMMAND_EXPERT),
-        PARAM_OMIT_CONSENSUS(PARAM_OMIT_CONSENSUS_ID, "--omit-consensus", "Omit consensus", "Omit consensus sequence in alignment", typeid(bool), (void*) &omitConsensus, "", MMseqsParameter::COMMAND_EXPERT),
-        PARAM_SKIP_QUERY(PARAM_SKIP_QUERY_ID, "--skip-query", "Skip query", "Skip the query sequence", typeid(bool), (void*) &skipQuery, "", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_ALLOW_DELETION(PARAM_ALLOW_DELETION_ID, "--allow-deletion", "Allow deletions", "Allow deletions in a MSA", typeid(bool), (void *) &allowDeletion, ""),
+        PARAM_ADD_INTERNAL_ID(PARAM_ADD_INTERNAL_ID_ID, "--add-iternal-id", "Add internal ID", "Add internal id as comment to MSA", typeid(bool), (void *) &addInternalId, "", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_COMPRESS_MSA(PARAM_COMPRESS_MSA_ID, "--compress", "Compress MSA", "Create MSA in CA3M format", typeid(bool), (void *) &compressMSA, ""),
+        PARAM_SUMMARIZE_HEADER(PARAM_SUMMARIZE_HEADER_ID, "--summarize", "Summarize headers", "Summarize cluster headers into a single header description", typeid(bool), (void *) &summarizeHeader, ""),
+        PARAM_SUMMARY_PREFIX(PARAM_SUMMARY_PREFIX_ID, "--summary-prefix", "Summary prefix", "Set the cluster summary prefix", typeid(std::string), (void *) &summaryPrefix, "", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_OMIT_CONSENSUS(PARAM_OMIT_CONSENSUS_ID, "--omit-consensus", "Omit consensus", "Omit consensus sequence in alignment", typeid(bool), (void *) &omitConsensus, "", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_SKIP_QUERY(PARAM_SKIP_QUERY_ID, "--skip-query", "Skip query", "Skip the query sequence", typeid(bool), (void *) &skipQuery, "", MMseqsParameter::COMMAND_EXPERT),
         // convertmsa
-        PARAM_IDENTIFIER_FIELD(PARAM_IDENTIFIER_FIELD_ID, "--identifier-field", "Identifier field", "Field from STOCKHOLM comments for choosing the MSA identifier: 0: ID, 1: AC. If the respective comment does not exist, the name of the first sequence will become the identifier.", typeid(int), (void*) &identifierField, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON),
+        PARAM_IDENTIFIER_FIELD(PARAM_IDENTIFIER_FIELD_ID, "--identifier-field", "Identifier field", "Field from STOCKHOLM comments for choosing the MSA identifier: 0: ID, 1: AC. If the respective comment does not exist, the name of the first sequence will become the identifier", typeid(int), (void *) &identifierField, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON),
         // msa2profile
-        PARAM_MATCH_MODE(PARAM_MATCH_MODE_ID, "--match-mode", "Match mode", "0: Columns that have a residue in the first sequence are kept, 1: columns that have a residue in --match-ratio of all sequences are kept.", typeid(int), (void*)&matchMode, "^(0|1)$", MMseqsParameter::COMMAND_PROFILE),
-        PARAM_MATCH_RATIO(PARAM_MATCH_RATIO_ID, "--match-ratio", "Match ratio", "columns that have a residue in this ratio of all sequences are kept", typeid(float), (void*)&matchRatio, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE),
+        PARAM_MATCH_MODE(PARAM_MATCH_MODE_ID, "--match-mode", "Match mode", "0: Columns that have a residue in the first sequence are kept, 1: columns that have a residue in --match-ratio of all sequences are kept", typeid(int), (void *) &matchMode, "^(0|1)$", MMseqsParameter::COMMAND_PROFILE),
+        PARAM_MATCH_RATIO(PARAM_MATCH_RATIO_ID, "--match-ratio", "Match ratio", "Columns that have a residue in this ratio of all sequences are kept", typeid(float), (void *) &matchRatio, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE),
         // result2profile
-        PARAM_MASK_PROFILE(PARAM_MASK_PROFILE_ID,"--mask-profile", "Mask profile", "mask query sequence of profile using tantan [0,1]", typeid(int),(void *) &maskProfile, "^[0-1]{1}$", MMseqsParameter::COMMAND_PROFILE |MMseqsParameter::COMMAND_EXPERT),
-        PARAM_E_PROFILE(PARAM_E_PROFILE_ID,"--e-profile", "Profile e-value threshold", "includes sequences matches with < e-value thr. into the profile (>=0.0)", typeid(float), (void *) &evalProfile, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|([0-9]*(\\.[0-9]+)?)$", MMseqsParameter::COMMAND_PROFILE),
-        PARAM_FILTER_MSA(PARAM_FILTER_MSA_ID,"--filter-msa", "Filter MSA", "filter msa: 0: do not filter, 1: filter", typeid(int), (void*) &filterMsa, "^[0-1]{1}$", MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_FILTER_MAX_SEQ_ID(PARAM_FILTER_MAX_SEQ_ID_ID,"--max-seq-id", "Maximum seq. id. threshold", "reduce redundancy of output MSA using max. pairwise sequence identity [0.0,1.0]", typeid(float), (void*) &filterMaxSeqId, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_FILTER_QSC(PARAM_FILTER_QSC_ID, "--qsc", "Minimum score per column", "reduce diversity of output MSAs using min. score per aligned residue with query sequences [-50.0,100.0]", typeid(float), (void*) &qsc, "^\\-*[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_FILTER_QID(PARAM_FILTER_QID_ID, "--qid", "Minimum seq. id.", "reduce diversity of output MSAs using min.seq. identity with query sequences [0.0,1.0]", typeid(float), (void*) &qid, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_FILTER_COV(PARAM_FILTER_COV_ID, "--cov", "Minimum coverage", "filter output MSAs using min. fraction of query residues covered by matched sequences [0.0,1.0]", typeid(float), (void*) &covMSAThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_FILTER_NDIFF(PARAM_FILTER_NDIFF_ID, "--diff", "Select N most diverse seqs", "filter MSAs by selecting most diverse set of sequences, keeping at least this many seqs in each MSA block of length 50", typeid(int), (void*) &Ndiff, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_WG(PARAM_WG_ID, "--wg", "Use global sequence weighting", "use global sequence weighting for profile calculation", typeid(bool), (void*) &wg, "", MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_PCA(PARAM_PCA_ID, "--pca", "Pseudo count a", "pseudo count admixture strength", typeid(float), (void*) &pca, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_PCB(PARAM_PCB_ID, "--pcb", "Pseudo count b", "pseudo counts: Neff at half of maximum admixture (range 0.0-inf)", typeid(float), (void*) &pcb, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
+        PARAM_MASK_PROFILE(PARAM_MASK_PROFILE_ID, "--mask-profile", "Mask profile", "Mask query sequence of profile using tantan [0,1]", typeid(int), (void *) &maskProfile, "^[0-1]{1}$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_E_PROFILE(PARAM_E_PROFILE_ID, "--e-profile", "Profile e-value threshold", "Include sequences matches with < e-value thr. into the profile (>=0.0)", typeid(float), (void *) &evalProfile, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|([0-9]*(\\.[0-9]+)?)$", MMseqsParameter::COMMAND_PROFILE),
+        PARAM_FILTER_MSA(PARAM_FILTER_MSA_ID, "--filter-msa", "Filter MSA", "Filter msa: 0: do not filter, 1: filter", typeid(int), (void *) &filterMsa, "^[0-1]{1}$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_FILTER_MAX_SEQ_ID(PARAM_FILTER_MAX_SEQ_ID_ID, "--max-seq-id", "Maximum seq. id. threshold", "Reduce redundancy of output MSA using max. pairwise sequence identity [0.0,1.0]", typeid(float), (void *) &filterMaxSeqId, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_FILTER_QSC(PARAM_FILTER_QSC_ID, "--qsc", "Minimum score per column", "Reduce diversity of output MSAs using min. score per aligned residue with query sequences [-50.0,100.0]", typeid(float), (void *) &qsc, "^\\-*[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_FILTER_QID(PARAM_FILTER_QID_ID, "--qid", "Minimum seq. id.", "Reduce diversity of output MSAs using min.seq. identity with query sequences [0.0,1.0]", typeid(float), (void *) &qid, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_FILTER_COV(PARAM_FILTER_COV_ID, "--cov", "Minimum coverage", "Filter output MSAs using min. fraction of query residues covered by matched sequences [0.0,1.0]", typeid(float), (void *) &covMSAThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_FILTER_NDIFF(PARAM_FILTER_NDIFF_ID, "--diff", "Select N most diverse seqs", "Filter MSAs by selecting most diverse set of sequences, keeping at least this many seqs in each MSA block of length 50", typeid(int), (void *) &Ndiff, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_WG(PARAM_WG_ID, "--wg", "Global sequence weighting", "Use global sequence weighting for profile calculation", typeid(bool), (void *) &wg, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_PCA(PARAM_PCA_ID, "--pca", "Pseudo count a", "Pseudo count admixture strength", typeid(float), (void *) &pca, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_PCB(PARAM_PCB_ID, "--pcb", "Pseudo count b", "Pseudo counts: Neff at half of maximum admixture (range 0.0-inf)", typeid(float), (void *) &pcb, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
         // sequence2profile
-        PARAM_NEFF(PARAM_NEFF_ID, "--neff", "Neff", "Neff included into context state profile (1.0,20.0)", typeid(float), (void*) &neff, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE),
-        PARAM_TAU(PARAM_TAU_ID, "--tau", "Tau", "Tau: context state pseudo count mixture (0.0,1.0)", typeid(float), (void*) &tau, "[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE),
+        PARAM_NEFF(PARAM_NEFF_ID, "--neff", "Neff", "Neff included into context state profile (1.0,20.0)", typeid(float), (void *) &neff, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE),
+        PARAM_TAU(PARAM_TAU_ID, "--tau", "Tau", "Tau: context state pseudo count mixture (0.0,1.0)", typeid(float), (void *) &tau, "[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PROFILE),
         //createtsv
-        PARAM_TARGET_COLUMN(PARAM_TARGET_COLUMN_ID, "--target-column", "Target column", "Select a target column (default 1), 0 if no target id exists.",typeid(int),(void *) &targetTsvColumn, "^[0-9]*$"),
-        PARAM_FIRST_SEQ_REP_SEQ(PARAM_FIRST_SEQ_REP_SEQ_ID, "--first-seq-as-repr", "First sequence as representative", "Use the first sequence of the clustering result as representative sequence", typeid(bool), (void*) &firstSeqRepr, "", MMseqsParameter::COMMAND_MISC),
-        PARAM_FULL_HEADER(PARAM_FULL_HEADER_ID, "--full-header", "Add full header", "Replace DB ID by its corresponding Full Header", typeid(bool), (void*) &fullHeader, ""),
-        PARAM_IDX_SEQ_SRC(PARAM_IDX_SEQ_SRC_ID, "--idx-seq-src", "Sequence source", "0: auto, 1: split/translated sequences, 2: input sequences", typeid(int), (void*) &idxSeqSrc, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC),
+        PARAM_TARGET_COLUMN(PARAM_TARGET_COLUMN_ID, "--target-column", "Target column", "Select a target column (default 1), 0 if no target id exists", typeid(int), (void *) &targetTsvColumn, "^[0-9]*$"),
+        PARAM_FIRST_SEQ_REP_SEQ(PARAM_FIRST_SEQ_REP_SEQ_ID, "--first-seq-as-repr", "First sequence as representative", "Use the first sequence of the clustering result as representative sequence", typeid(bool), (void *) &firstSeqRepr, "", MMseqsParameter::COMMAND_MISC),
+        PARAM_FULL_HEADER(PARAM_FULL_HEADER_ID, "--full-header", "Add full header", "Replace DB ID by its corresponding Full Header", typeid(bool), (void *) &fullHeader, ""),
+        PARAM_IDX_SEQ_SRC(PARAM_IDX_SEQ_SRC_ID, "--idx-seq-src", "Sequence source", "0: auto, 1: split/translated sequences, 2: input sequences", typeid(int), (void *) &idxSeqSrc, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC),
 
         // result2stats
-        PARAM_STAT(PARAM_STAT_ID, "--stat", "Statistics to be computed", "can be one of: linecount, mean, doolittle, charges, seqlen, firstline.", typeid(std::string), (void*) &stat, ""),
+        PARAM_STAT(PARAM_STAT_ID, "--stat", "Statistics to be computed", "One of: linecount, mean, doolittle, charges, seqlen, firstline", typeid(std::string), (void *) &stat, ""),
         // linearcluster
-        PARAM_KMER_PER_SEQ(PARAM_KMER_PER_SEQ_ID, "--kmer-per-seq", "K-mers per sequence", "kmer per sequence", typeid(int), (void*) &kmersPerSequence, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR),
-        PARAM_KMER_PER_SEQ_SCALE(PARAM_KMER_PER_SEQ_SCALE_ID, "--kmer-per-seq-scale", "scale k-mers per sequence", "scale kmer per sequence based on sequence length as kmer-per-seq val + scale x seqlen", typeid(float), (void*) &kmersPerSequenceScale, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_EXPERT),
-        PARAM_INCLUDE_ONLY_EXTENDABLE(PARAM_INCLUDE_ONLY_EXTENDABLE_ID, "--include-only-extendable", "Include only extendable", "Include only extendable", typeid(bool), (void*) &includeOnlyExtendable, "", MMseqsParameter::COMMAND_CLUSTLINEAR),
-        PARAM_IGNORE_MULTI_KMER(PARAM_IGNORE_MULTI_KMER_ID, "--ignore-multi-kmer", "Skip repeating k-mers", "Skip kmers occuring multiple times (>=2)", typeid(bool), (void*) &ignoreMultiKmer, "", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_HASH_SHIFT(PARAM_HASH_SHIFT_ID, "--hash-shift", "Shift hash", "Shift k-mer hash", typeid(int), (void*) &hashShift, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_PICK_N_SIMILAR(PARAM_HASH_SHIFT_ID, "--pick-n-sim-kmer", "Add N similar to search", "adds N similar to search", typeid(int), (void*) &pickNbest, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_ADJUST_KMER_LEN(PARAM_ADJUST_KMER_LEN_ID, "--adjust-kmer-len", "Adjust k-mer length", "adjust k-mer length based on specificity (only for nucleotides)", typeid(bool), (void*) &adjustKmerLength, "", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT),
+        PARAM_KMER_PER_SEQ(PARAM_KMER_PER_SEQ_ID, "--kmer-per-seq", "k-mers per sequence", "k-mers per sequence", typeid(int), (void *) &kmersPerSequence, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR),
+        PARAM_KMER_PER_SEQ_SCALE(PARAM_KMER_PER_SEQ_SCALE_ID, "--kmer-per-seq-scale", "Scale k-mers per sequence", "Scale k-mer per sequence based on sequence length as kmer-per-seq val + scale x seqlen", typeid(MultiParam<float>), (void *) &kmersPerSequenceScale, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_CLUSTLINEAR),
+        PARAM_INCLUDE_ONLY_EXTENDABLE(PARAM_INCLUDE_ONLY_EXTENDABLE_ID, "--include-only-extendable", "Include only extendable", "Include only extendable", typeid(bool), (void *) &includeOnlyExtendable, "", MMseqsParameter::COMMAND_CLUSTLINEAR),
+        PARAM_IGNORE_MULTI_KMER(PARAM_IGNORE_MULTI_KMER_ID, "--ignore-multi-kmer", "Skip repeating k-mers", "Skip k-mers occuring multiple times (>=2)", typeid(bool), (void *) &ignoreMultiKmer, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_HASH_SHIFT(PARAM_HASH_SHIFT_ID, "--hash-shift", "Shift hash", "Shift k-mer hash initilization", typeid(int), (void *) &hashShift, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_PICK_N_SIMILAR(PARAM_PICK_N_SIMILAR_ID, "--pick-n-sim-kmer", "Add N similar to search", "Add N similar k-mers to search", typeid(int), (void *) &pickNbest, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_ADJUST_KMER_LEN(PARAM_ADJUST_KMER_LEN_ID, "--adjust-kmer-len", "Adjust k-mer length", "Adjust k-mer length based on specificity (only for nucleotides)", typeid(bool), (void *) &adjustKmerLength, "", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_RESULT_DIRECTION(PARAM_RESULT_DIRECTION_ID, "--result-direction", "Result direction", "result is 0: query, 1: target centric", typeid(int), (void *) &resultDirection, "^[0-1]{1}$", MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
 
         // workflow
-        PARAM_RUNNER(PARAM_RUNNER_ID, "--mpi-runner", "MPI runner","Use MPI on compute grid with this MPI command (e.g. \"mpirun -np 42\")",typeid(std::string),(void *) &runner, "", MMseqsParameter::COMMAND_COMMON|MMseqsParameter::COMMAND_EXPERT),
-        PARAM_REUSELATEST(PARAM_REUSELATEST_ID, "--force-reuse", "Force restart with latest tmp", "reuse tmp file in tmp/latest folder ignoring parameters and git version change", typeid(bool),(void *) &reuseLatest, "", MMseqsParameter::COMMAND_COMMON|MMseqsParameter::COMMAND_EXPERT),
+        PARAM_RUNNER(PARAM_RUNNER_ID, "--mpi-runner", "MPI runner", "Use MPI on compute cluster with this MPI command (e.g. \"mpirun -np 42\")", typeid(std::string), (void *) &runner, "", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
+        PARAM_REUSELATEST(PARAM_REUSELATEST_ID, "--force-reuse", "Force restart with latest tmp", "Reuse tmp filse in tmp/latest folder ignoring parameters and version changes", typeid(bool), (void *) &reuseLatest, "", MMseqsParameter::COMMAND_COMMON | MMseqsParameter::COMMAND_EXPERT),
         // search workflow
-        PARAM_NUM_ITERATIONS(PARAM_NUM_ITERATIONS_ID, "--num-iterations", "Number search iterations","Search iterations",typeid(int),(void *) &numIterations, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE),
-        PARAM_START_SENS(PARAM_START_SENS_ID, "--start-sens", "Start sensitivity","start sensitivity",typeid(float),(void *) &startSens, "^[0-9]*(\\.[0-9]+)?$"),
-        PARAM_SENS_STEPS(PARAM_SENS_STEPS_ID, "--sens-steps", "Search steps","Search steps performed from --start-sense and -s.",typeid(int),(void *) &sensSteps, "^[1-9]{1}$"),
-        PARAM_SLICE_SEARCH(PARAM_SLICE_SEARCH_ID, "--slice-search", "Run a seq-profile search in slice mode", "For bigger profile DB, run iteratively the search by greedily swapping the search results.", typeid(bool),(void *) &sliceSearch, "", MMseqsParameter::COMMAND_PROFILE|MMseqsParameter::COMMAND_EXPERT),
+        PARAM_NUM_ITERATIONS(PARAM_NUM_ITERATIONS_ID, "--num-iterations", "Search iterations", "Number of iterative profile search iterations", typeid(int), (void *) &numIterations, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE),
+        PARAM_START_SENS(PARAM_START_SENS_ID, "--start-sens", "Start sensitivity", "Start sensitivity", typeid(float), (void *) &startSens, "^[0-9]*(\\.[0-9]+)?$"),
+        PARAM_SENS_STEPS(PARAM_SENS_STEPS_ID, "--sens-steps", "Search steps", "Number of search steps performed from --start-sens to -s", typeid(int), (void *) &sensSteps, "^[1-9]{1}$"),
+        PARAM_SLICE_SEARCH(PARAM_SLICE_SEARCH_ID, "--slice-search", "Slice search mode", "For bigger profile DB, run iteratively the search by greedily swapping the search results", typeid(bool), (void *) &sliceSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT),
         PARAM_STRAND(PARAM_STRAND_ID, "--strand", "Strand selection", "Strand selection only works for DNA/DNA search 0: reverse, 1: forward, 2: both", typeid(int), (void *) &strand, "^[0-2]{1}$", MMseqsParameter::COMMAND_EXPERT),
         // easysearch
-        PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query.", typeid(bool), (void*)&greedyBestHits, ""),
+        PARAM_GREEDY_BEST_HITS(PARAM_GREEDY_BEST_HITS_ID, "--greedy-best-hits", "Greedy best hits", "Choose the best hits greedily to cover the query", typeid(bool), (void *) &greedyBestHits, ""),
         // extractorfs
-        PARAM_ORF_MIN_LENGTH(PARAM_ORF_MIN_LENGTH_ID, "--min-length", "Min codons in orf", "minimum codon number in open reading frames",typeid(int),(void *) &orfMinLength, "^[1-9]{1}[0-9]*$"),
-        PARAM_ORF_MAX_LENGTH(PARAM_ORF_MAX_LENGTH_ID, "--max-length", "Max codons in length", "maximum codon number in open reading frames",typeid(int),(void *) &orfMaxLength, "^[1-9]{1}[0-9]*$"),
-        PARAM_ORF_MAX_GAP(PARAM_ORF_MAX_GAP_ID, "--max-gaps", "Max orf gaps", "maximum number of codons with gaps or unknown residues before an open reading frame is rejected",typeid(int),(void *) &orfMaxGaps, "^(0|[1-9]{1}[0-9]*)$"),
-        PARAM_CONTIG_START_MODE(PARAM_CONTIG_START_MODE_ID,"--contig-start-mode", "Contig start mode", "Contig start can be 0: incomplete, 1: complete, 2: both",typeid(int),(void *) &contigStartMode, "^[0-2]{1}"),
-        PARAM_CONTIG_END_MODE(PARAM_CONTIG_END_MODE_ID,"--contig-end-mode", "Contig end mode", "Contig end can be 0: incomplete, 1: complete, 2: both ",typeid(int),(void *) &contigEndMode, "^[0-2]{1}"),
-        PARAM_ORF_START_MODE(PARAM_ORF_START_MODE_ID,"--orf-start-mode", "Orf start mode", "Orf fragment can be 0: from start to stop, 1: from any to stop, 2: from last encountered start to stop (no start in the middle)",typeid(int),(void *) &orfStartMode, "^[0-2]{1}"),
-        PARAM_ORF_FORWARD_FRAMES(PARAM_ORF_FORWARD_FRAMES_ID, "--forward-frames", "Forward frames", "comma-seperated list of ORF frames on the forward strand to be extracted", typeid(std::string), (void *) &forwardFrames, ""),
-        PARAM_ORF_REVERSE_FRAMES(PARAM_ORF_REVERSE_FRAMES_ID, "--reverse-frames", "Reverse frames", "comma-seperated list of ORF frames on the reverse strand to be extracted", typeid(std::string), (void *) &reverseFrames, ""),
-        PARAM_USE_ALL_TABLE_STARTS(PARAM_USE_ALL_TABLE_STARTS_ID,"--use-all-table-starts", "Use all table starts", "use all alteratives for a start codon in the genetic table, if false - only ATG (AUG)",typeid(bool),(void *) &useAllTableStarts, ""),
-        PARAM_TRANSLATE(PARAM_TRANSLATE_ID,"--translate", "Translate orf", "translate ORF to amino acid",typeid(int),(void *) &translate, "^[0-1]{1}"),
+        PARAM_ORF_MIN_LENGTH(PARAM_ORF_MIN_LENGTH_ID, "--min-length", "Min codons in orf", "Minimum codon number in open reading frames", typeid(int), (void *) &orfMinLength, "^[1-9]{1}[0-9]*$"),
+        PARAM_ORF_MAX_LENGTH(PARAM_ORF_MAX_LENGTH_ID, "--max-length", "Max codons in length", "Maximum codon number in open reading frames", typeid(int), (void *) &orfMaxLength, "^[1-9]{1}[0-9]*$"),
+        PARAM_ORF_MAX_GAP(PARAM_ORF_MAX_GAP_ID, "--max-gaps", "Max orf gaps", "Maximum number of codons with gaps or unknown residues before an open reading frame is rejected", typeid(int), (void *) &orfMaxGaps, "^(0|[1-9]{1}[0-9]*)$"),
+        PARAM_CONTIG_START_MODE(PARAM_CONTIG_START_MODE_ID, "--contig-start-mode", "Contig start mode", "Contig start can be 0: incomplete, 1: complete, 2: both", typeid(int), (void *) &contigStartMode, "^[0-2]{1}"),
+        PARAM_CONTIG_END_MODE(PARAM_CONTIG_END_MODE_ID, "--contig-end-mode", "Contig end mode", "Contig end can be 0: incomplete, 1: complete, 2: both", typeid(int), (void *) &contigEndMode, "^[0-2]{1}"),
+        PARAM_ORF_START_MODE(PARAM_ORF_START_MODE_ID, "--orf-start-mode", "Orf start mode", "Orf fragment can be 0: from start to stop, 1: from any to stop, 2: from last encountered start to stop (no start in the middle)", typeid(int), (void *) &orfStartMode, "^[0-2]{1}"),
+        PARAM_ORF_FORWARD_FRAMES(PARAM_ORF_FORWARD_FRAMES_ID, "--forward-frames", "Forward frames", "Comma-seperated list of frames on the forward strand to be extracted", typeid(std::string), (void *) &forwardFrames, ""),
+        PARAM_ORF_REVERSE_FRAMES(PARAM_ORF_REVERSE_FRAMES_ID, "--reverse-frames", "Reverse frames", "Comma-seperated list of frames on the reverse strand to be extracted", typeid(std::string), (void *) &reverseFrames, ""),
+        PARAM_USE_ALL_TABLE_STARTS(PARAM_USE_ALL_TABLE_STARTS_ID, "--use-all-table-starts", "Use all table starts", "Use all alteratives for a start codon in the genetic table, if false - only ATG (AUG)", typeid(bool), (void *) &useAllTableStarts, ""),
+        PARAM_TRANSLATE(PARAM_TRANSLATE_ID, "--translate", "Translate orf", "Translate ORF to amino acid", typeid(int), (void *) &translate, "^[0-1]{1}"),
         PARAM_CREATE_LOOKUP(PARAM_CREATE_LOOKUP_ID, "--create-lookup", "Create lookup", "Create database lookup file (can be very large)", typeid(int), (void *) &createLookup, "^[0-1]{1}", MMseqsParameter::COMMAND_EXPERT),
         // indexdb
-        PARAM_CHECK_COMPATIBLE(PARAM_CHECK_COMPATIBLE_ID, "--check-compatible", "Check compatible", "0: Always recreate index, 1: Check if recreating index is needed, 2: Fail if index is incompatible", typeid(int), (void*) &checkCompatible, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC),
-        PARAM_SEARCH_TYPE(PARAM_SEARCH_TYPE_ID, "--search-type", "Search type", "search type 0: auto 1: amino acid, 2: translated, 3: nucleotide", typeid(int),(void *) &searchType, "^[0-3]{1}"),
+        PARAM_CHECK_COMPATIBLE(PARAM_CHECK_COMPATIBLE_ID, "--check-compatible", "Check compatible", "0: Always recreate index, 1: Check if recreating index is needed, 2: Fail if index is incompatible", typeid(int), (void *) &checkCompatible, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC),
+        PARAM_SEARCH_TYPE(PARAM_SEARCH_TYPE_ID, "--search-type", "Search type", "Search type 0: auto 1: amino acid, 2: translated, 3: nucleotide, 4: translated nucleotide alignment", typeid(int), (void *) &searchType, "^[0-4]{1}"),
         // createdb
-        PARAM_USE_HEADER(PARAM_USE_HEADER_ID,"--use-fasta-header", "Use fasta header", "use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers",typeid(bool),(void *) &useHeader, ""),
-        PARAM_ID_OFFSET(PARAM_ID_OFFSET_ID, "--id-offset", "Offset of numeric ids", "numeric ids in index file are offset by this value ",typeid(int),(void *) &identifierOffset, "^(0|[1-9]{1}[0-9]*)$"),
-        PARAM_DB_TYPE(PARAM_DB_TYPE_ID,"--dbtype", "Database type", "Database type 0: auto, 1: amino acid 2: nucleotides",typeid(int),(void *) &dbType, "[0-2]{1}"),
-        PARAM_CREATEDB_MODE(PARAM_CREATEDB_MODE_ID, "--createdb-mode", "Createdb mode", "createdb mode 0: copy data, 1: soft link data and write new index (works only with single line fasta/q)",typeid(int),(void *) &createdbMode, "^[0-1]{1}$"),
-        PARAM_SHUFFLE(PARAM_SHUFFLE_ID,"--shuffle", "Shuffle input database", "Shuffle input database",typeid(bool),(void *) &shuffleDatabase, ""),
-        PARAM_USE_HEADER_FILE(PARAM_USE_HEADER_FILE_ID, "--use-header-file", "Use ffindex header", "use the ffindex header file instead of the body to map the entry keys",typeid(bool),(void *) &useHeaderFile, ""),
+        PARAM_USE_HEADER(PARAM_USE_HEADER_ID, "--use-fasta-header", "Use fasta header", "Use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers", typeid(bool), (void *) &useHeader, ""),
+        PARAM_ID_OFFSET(PARAM_ID_OFFSET_ID, "--id-offset", "Offset of numeric ids", "Numeric ids in index file are offset by this value", typeid(int), (void *) &identifierOffset, "^(0|[1-9]{1}[0-9]*)$"),
+        PARAM_DB_TYPE(PARAM_DB_TYPE_ID, "--dbtype", "Database type", "Database type 0: auto, 1: amino acid 2: nucleotides", typeid(int), (void *) &dbType, "[0-2]{1}"),
+        PARAM_CREATEDB_MODE(PARAM_CREATEDB_MODE_ID, "--createdb-mode", "Createdb mode", "Createdb mode 0: copy data, 1: soft link data and write new index (works only with single line fasta/q)", typeid(int), (void *) &createdbMode, "^[0-1]{1}$"),
+        PARAM_SHUFFLE(PARAM_SHUFFLE_ID, "--shuffle", "Shuffle input database", "Shuffle input database", typeid(bool), (void *) &shuffleDatabase, ""),
+        PARAM_USE_HEADER_FILE(PARAM_USE_HEADER_FILE_ID, "--use-header-file", "Use header DB", "use the sequence header DB instead of the body to map the entry keys", typeid(bool), (void *) &useHeaderFile, ""),
         // splitsequence
-        PARAM_SEQUENCE_OVERLAP(PARAM_SEQUENCE_OVERLAP_ID, "--sequence-overlap", "Overlap between sequences", "overlap between sequences",typeid(int),(void *) &sequenceOverlap, "^(0|[1-9]{1}[0-9]*)$"),
-        PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "sequence split mode 0: copy data, 1: soft link data and write new index,",typeid(int),(void *) &sequenceSplitMode, "^[0-1]{1}$"),
+        PARAM_SEQUENCE_OVERLAP(PARAM_SEQUENCE_OVERLAP_ID, "--sequence-overlap", "Overlap between sequences", "Overlap between sequences", typeid(int), (void *) &sequenceOverlap, "^(0|[1-9]{1}[0-9]*)$"),
+        PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "Sequence split mode 0: copy data, 1: soft link data and write new index,", typeid(int), (void *) &sequenceSplitMode, "^[0-1]{1}$"),
         // gff2db
-        PARAM_GFF_TYPE(PARAM_GFF_TYPE_ID,"--gff-type", "GFF type", "type in the GFF file to filter by",typeid(std::string),(void *) &gffType, ""),
+        PARAM_GFF_TYPE(PARAM_GFF_TYPE_ID, "--gff-type", "GFF type", "Type in the GFF file to filter by", typeid(std::string), (void *) &gffType, ""),
         // translatenucs
-        PARAM_TRANSLATION_TABLE(PARAM_TRANSLATION_TABLE_ID,"--translation-table", "Translation table", "1) CANONICAL, 2) VERT_MITOCHONDRIAL, 3) YEAST_MITOCHONDRIAL, 4) MOLD_MITOCHONDRIAL, 5) INVERT_MITOCHONDRIAL, 6) CILIATE, 9) FLATWORM_MITOCHONDRIAL, 10) EUPLOTID, 11) PROKARYOTE, 12) ALT_YEAST, 13) ASCIDIAN_MITOCHONDRIAL, 14) ALT_FLATWORM_MITOCHONDRIAL, 15) BLEPHARISMA, 16) CHLOROPHYCEAN_MITOCHONDRIAL, 21) TREMATODE_MITOCHONDRIAL, 22) SCENEDESMUS_MITOCHONDRIAL, 23) THRAUSTOCHYTRIUM_MITOCHONDRIAL, 24) PTEROBRANCHIA_MITOCHONDRIAL, 25) GRACILIBACTERIA, 26) PACHYSOLEN, 27) KARYORELICT, 28) CONDYLOSTOMA, 29) MESODINIUM, 30) PERTRICH, 31) BLASTOCRITHIDIA", typeid(int),(void *) &translationTable, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_MISC|MMseqsParameter::COMMAND_EXPERT),
+        PARAM_TRANSLATION_TABLE(PARAM_TRANSLATION_TABLE_ID, "--translation-table", "Translation table", "1) CANONICAL, 2) VERT_MITOCHONDRIAL, 3) YEAST_MITOCHONDRIAL, 4) MOLD_MITOCHONDRIAL, 5) INVERT_MITOCHONDRIAL, 6) CILIATE\n9) FLATWORM_MITOCHONDRIAL, 10) EUPLOTID, 11) PROKARYOTE, 12) ALT_YEAST, 13) ASCIDIAN_MITOCHONDRIAL, 14) ALT_FLATWORM_MITOCHONDRIAL\n15) BLEPHARISMA, 16) CHLOROPHYCEAN_MITOCHONDRIAL, 21) TREMATODE_MITOCHONDRIAL, 22) SCENEDESMUS_MITOCHONDRIAL\n23) THRAUSTOCHYTRIUM_MITOCHONDRIAL, 24) PTEROBRANCHIA_MITOCHONDRIAL, 25) GRACILIBACTERIA, 26) PACHYSOLEN, 27) KARYORELICT, 28) CONDYLOSTOMA\n 29) MESODINIUM, 30) PERTRICH, 31) BLASTOCRITHIDIA", typeid(int), (void *) &translationTable, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_MISC | MMseqsParameter::COMMAND_EXPERT),
         // createseqfiledb
-        PARAM_ADD_ORF_STOP(PARAM_ADD_ORF_STOP_ID,"--add-orf-stop", "Add orf stop", "add * at complete start and end", typeid(bool),(void *) &addOrfStop, ""),
+        PARAM_ADD_ORF_STOP(PARAM_ADD_ORF_STOP_ID, "--add-orf-stop", "Add orf stop", "Add stop codon '*' at complete start and end", typeid(bool), (void *) &addOrfStop, ""),
         // createseqfiledb
-        PARAM_MIN_SEQUENCES(PARAM_MIN_SEQUENCES_ID,"--min-sequences", "Min sequences", "minimum number of sequences a cluster may contain", typeid(int),(void *) &minSequences,"^[1-9]{1}[0-9]*$"),
-        PARAM_MAX_SEQUENCES(PARAM_MAX_SEQUENCES_ID,"--max-sequences", "Max sequences", "maximum number of sequences a cluster may contain", typeid(int),(void *) &maxSequences,"^[1-9]{1}[0-9]*$"),
-        PARAM_HH_FORMAT(PARAM_HH_FORMAT_ID,"--hh-format", "HH format", "format entries to use with hhsuite (for singleton clusters)", typeid(bool), (void *) &hhFormat, ""),
+        PARAM_MIN_SEQUENCES(PARAM_MIN_SEQUENCES_ID, "--min-sequences", "Min sequences", "Minimum number of sequences a cluster may contain", typeid(int), (void *) &minSequences, "^[1-9]{1}[0-9]*$"),
+        PARAM_MAX_SEQUENCES(PARAM_MAX_SEQUENCES_ID, "--max-sequences", "Max sequences", "Maximum number of sequences a cluster may contain", typeid(int), (void *) &maxSequences, "^[1-9]{1}[0-9]*$"),
+        PARAM_HH_FORMAT(PARAM_HH_FORMAT_ID, "--hh-format", "HH format", "Format entries to use with hhsuite (for singleton clusters)", typeid(bool), (void *) &hhFormat, ""),
         // filterdb
-        PARAM_FILTER_COL(PARAM_FILTER_COL_ID,"--filter-column", "Filter column", "column", typeid(int),(void *) &filterColumn,"^[1-9]{1}[0-9]*$"),
-        PARAM_COLUMN_TO_TAKE(PARAM_COLUMN_TO_TAKE_ID,"--column-to-take", "Column to take", "column to take in join mode. If -1, the whole line is taken", typeid(int),(void *) &columnToTake,"^(-1|0|[1-9]{1}[0-9]*)$"),
-        PARAM_FILTER_REGEX(PARAM_FILTER_REGEX_ID,"--filter-regex", "Filter regex", "regex to select column (example float: [0-9]*(.[0-9]+)? int:[1-9]{1}[0-9])", typeid(std::string),(void *) &filterColumnRegex,"^.*$"),
-        PARAM_FILTER_POS(PARAM_FILTER_POS_ID,"--positive-filter", "Positive filter", "used in conjunction with --filter-file. If true, out  = in \\intersect filter ; if false, out = in - filter", typeid(bool),(void *) &positiveFilter,""),
-        PARAM_FILTER_FILE(PARAM_FILTER_FILE_ID,"--filter-file", "Filter file", "specify a file that contains the filtering elements", typeid(std::string),(void *) &filteringFile,""),
-        PARAM_FILTER_EXPRESSION(PARAM_FILTER_EXPRESSION_ID, "--filter-expression", "Filter expression", "Specify a mathematical expression to filter lines", typeid(std::string), (void*) &filterExpression, ""),
-        PARAM_MAPPING_FILE(PARAM_MAPPING_FILE_ID,"--mapping-file", "Mapping file", "specify a file that translates the keys of a DB to new keys, TSV format", typeid(std::string),(void *) &mappingFile,""),
-        PARAM_TRIM_TO_ONE_COL(PARAM_TRIM_TO_ONE_COL_ID,"--trim-to-one-column", "Trim to one column","Output only the column specified by --filter-column.",typeid(bool), (void *) &trimToOneColumn, ""),
-        PARAM_EXTRACT_LINES(PARAM_EXTRACT_LINES_ID,"--extract-lines", "Extract N lines", "extract n lines of each entry.",typeid(int), (void *) &extractLines, "^[1-9]{1}[0-9]*$"),
-        PARAM_COMP_OPERATOR(PARAM_COMP_OPERATOR_ID, "--comparison-operator", "Numerical comparison operator", "Filter by comparing each entry row numerically by using the le) less-than-equal, ge) greater-than-equal or e) equal operator.", typeid(std::string), (void *) &compOperator, ""),
-        PARAM_COMP_VALUE(PARAM_COMP_VALUE_ID, "--comparison-value", "Numerical comparison value", "Filter by comparing each entry to this value.", typeid(float), (void *) &compValue, "^.*$"),
-        PARAM_SORT_ENTRIES(PARAM_SORT_ENTRIES_ID, "--sort-entries", "Sort entries", "Sort column set by --filter-column, by 0: no sorting, 1: increasing, 2: decreasing, 3: random shuffle.", typeid(int), (void *) &sortEntries, "^[1-9]{1}[0-9]*$"),
-        PARAM_BEATS_FIRST(PARAM_BEATS_FIRST_ID, "--beats-first", "Beats first", "Filter by comparing each entry to the first entry.", typeid(bool), (void*) &beatsFirst, ""),
-        PARAM_JOIN_DB(PARAM_JOIN_DB_ID, "--join-db","join to DB", "Join another database entry with respect to the database identifier in the chosen column", typeid(std::string), (void*) &joinDB, ""),
-        PARAM_COMPUTE_POSITIONS(PARAM_COMPUTE_POSITIONS_ID, "--compute-positions", "Compute positions", "Add the positions of he hit on the target genome", typeid(std::string), (void*) &compPos, ""),
-        PARAM_TRANSITIVE_REPLACE(PARAM_TRANSITIVE_REPLACE_ID, "--transitive-replace", "Replace transitively", "Replace cluster name in a search file by all genes in this cluster", typeid(std::string), (void*) &clusterFile, ""),
+        PARAM_FILTER_COL(PARAM_FILTER_COL_ID, "--filter-column", "Filter column", "column", typeid(int), (void *) &filterColumn, "^[1-9]{1}[0-9]*$"),
+        PARAM_COLUMN_TO_TAKE(PARAM_COLUMN_TO_TAKE_ID, "--column-to-take", "Column to take", "column to take in join mode. If -1, the whole line is taken", typeid(int), (void *) &columnToTake, "^(-1|0|[1-9]{1}[0-9]*)$"),
+        PARAM_FILTER_REGEX(PARAM_FILTER_REGEX_ID, "--filter-regex", "Filter regex", "Regex to select column (example float: [0-9]*(.[0-9]+)? int:[1-9]{1}[0-9])", typeid(std::string), (void *) &filterColumnRegex, "^.*$"),
+        PARAM_FILTER_POS(PARAM_FILTER_POS_ID, "--positive-filter", "Positive filter", "Used in conjunction with --filter-file. If true, out  = in \\intersect filter ; if false, out = in - filter", typeid(bool), (void *) &positiveFilter, ""),
+        PARAM_FILTER_FILE(PARAM_FILTER_FILE_ID, "--filter-file", "Filter file", "Specify a file that contains the filtering elements", typeid(std::string), (void *) &filteringFile, ""),
+        PARAM_FILTER_EXPRESSION(PARAM_FILTER_EXPRESSION_ID, "--filter-expression", "Filter expression", "Specify a mathematical expression to filter lines", typeid(std::string), (void *) &filterExpression, ""),
+        PARAM_MAPPING_FILE(PARAM_MAPPING_FILE_ID, "--mapping-file", "Mapping file", "Specify a file that translates the keys of a DB to new keys, TSV format", typeid(std::string), (void *) &mappingFile, ""),
+        PARAM_TRIM_TO_ONE_COL(PARAM_TRIM_TO_ONE_COL_ID, "--trim-to-one-column", "Trim to one column", "Output only the column specified by --filter-column", typeid(bool), (void *) &trimToOneColumn, ""),
+        PARAM_EXTRACT_LINES(PARAM_EXTRACT_LINES_ID, "--extract-lines", "Extract N lines", "Extract n lines of each entry", typeid(int), (void *) &extractLines, "^[1-9]{1}[0-9]*$"),
+        PARAM_COMP_OPERATOR(PARAM_COMP_OPERATOR_ID, "--comparison-operator", "Numerical comparison operator", "Filter by comparing each entry row numerically by using the le) less-than-equal, ge) greater-than-equal or e) equal operator", typeid(std::string), (void *) &compOperator, ""),
+        PARAM_COMP_VALUE(PARAM_COMP_VALUE_ID, "--comparison-value", "Numerical comparison value", "Filter by comparing each entry to this value", typeid(double), (void *) &compValue, "^.*$"),
+        PARAM_SORT_ENTRIES(PARAM_SORT_ENTRIES_ID, "--sort-entries", "Sort entries", "Sort column set by --filter-column, by 0: no sorting, 1: increasing, 2: decreasing, 3: random shuffle", typeid(int), (void *) &sortEntries, "^[1-9]{1}[0-9]*$"),
+        PARAM_BEATS_FIRST(PARAM_BEATS_FIRST_ID, "--beats-first", "Beats first", "Filter by comparing each entry to the first entry", typeid(bool), (void *) &beatsFirst, ""),
+        PARAM_JOIN_DB(PARAM_JOIN_DB_ID, "--join-db", "join to DB", "Join another database entry with respect to the database identifier in the chosen column", typeid(std::string), (void *) &joinDB, ""),
         // besthitperset
-        PARAM_SIMPLE_BEST_HIT(PARAM_SIMPLE_BEST_HIT_ID, "--simple-best-hit", "Use simple best hit", "Update the p-value by a single best hit, or by best and second best hits", typeid(bool), (void*) &simpleBestHit, ""),
-        PARAM_ALPHA(PARAM_ALPHA_ID, "--alpha", "Alpha", "Set alpha for combining p-values during aggregation", typeid(float), (void*) &alpha, ""),
-        PARAM_SHORT_OUTPUT(PARAM_SHORT_OUTPUT_ID, "--short-output", "Short output", "The output database will contain only the spread p-value", typeid(bool), (void*) &shortOutput, ""),
-        PARAM_AGGREGATION_MODE(PARAM_AGGREGATION_MODE_ID, "--aggregation-mode", "Aggregation mode", "Combined P-values computed from 0: multi-hit, 1: minimum of all P-values, 2: product-of-P-values, 3: truncated product", typeid(int), (void*) &aggregationMode, "^[0-4]{1}$"),
+        PARAM_SIMPLE_BEST_HIT(PARAM_SIMPLE_BEST_HIT_ID, "--simple-best-hit", "Use simple best hit", "Update the p-value by a single best hit, or by best and second best hits", typeid(bool), (void *) &simpleBestHit, ""),
+        PARAM_ALPHA(PARAM_ALPHA_ID, "--alpha", "Alpha", "Set alpha for combining p-values during aggregation", typeid(float), (void *) &alpha, ""),
+        PARAM_SHORT_OUTPUT(PARAM_SHORT_OUTPUT_ID, "--short-output", "Short output", "The output database will contain only the spread p-value", typeid(bool), (void *) &shortOutput, ""),
+        PARAM_AGGREGATION_MODE(PARAM_AGGREGATION_MODE_ID, "--aggregation-mode", "Aggregation mode", "Combined P-values computed from 0: multi-hit, 1: minimum of all P-values, 2: product-of-P-values, 3: truncated product", typeid(int), (void *) &aggregationMode, "^[0-4]{1}$"),
         // concatdb
-        PARAM_PRESERVEKEYS(PARAM_PRESERVEKEYS_ID,"--preserve-keys", "Preserve the keys", "the keys of the two DB should be distinct, and they will be preserved in the concatenation.",typeid(bool), (void *) &preserveKeysB, ""),
-        PARAM_TAKE_LARGER_ENTRY(PARAM_TAKE_LARGER_ENTRY_ID,"--take-larger-entry", "Take the larger entry", "only keeps the larger entry (dataSize >) in the concatenation, both databases need the same keys in the index",typeid(bool), (void *) &takeLargerEntry, ""),
+        PARAM_PRESERVEKEYS(PARAM_PRESERVEKEYS_ID, "--preserve-keys", "Preserve the keys", "The keys of the two DB should be distinct, and they will be preserved in the concatenation", typeid(bool), (void *) &preserveKeysB, ""),
+        PARAM_TAKE_LARGER_ENTRY(PARAM_TAKE_LARGER_ENTRY_ID, "--take-larger-entry", "Take the larger entry", "Only keep the larger entry (dataSize >) in the concatenation, both databases need the same keys in the index", typeid(bool), (void *) &takeLargerEntry, ""),
         // offsetalignment
-        PARAM_CHAIN_ALIGNMENT(PARAM_CHAIN_ALIGNMENT_ID,"--chain-alignments", "Chain overlapping alignments", "Chain overlapping alignments",typeid(int),(void *) &chainAlignment, "^[0-1]{1}", MMseqsParameter::COMMAND_EXPERT),
-        PARAM_MERGE_QUERY(PARAM_MERGE_QUERY_ID,"--merge-query", "Merge query", "combine ORFs/split sequences to a single entry",typeid(int),(void *) &mergeQuery, "^[0-1]{1}", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_CHAIN_ALIGNMENT(PARAM_CHAIN_ALIGNMENT_ID, "--chain-alignments", "Chain overlapping alignments", "Chain overlapping alignments", typeid(int), (void *) &chainAlignment, "^[0-1]{1}", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_MERGE_QUERY(PARAM_MERGE_QUERY_ID, "--merge-query", "Merge query", "Combine ORFs/split sequences to a single entry", typeid(int), (void *) &mergeQuery, "^[0-1]{1}", MMseqsParameter::COMMAND_EXPERT),
         // tsv2db
-        PARAM_OUTPUT_DBTYPE(PARAM_OUTPUT_DBTYPE_ID,"--output-dbtype", "Output database type", "Set database type for resulting database: Amino acid sequences 0, Nucl. seq. 1, Profiles 2, Alignment result 5, Clustering result 6, Prefiltering result 7, Taxonomy result 8, Indexed database 9, cA3M MSAs 10, FASTA or A3M MSAs 11, Generic database 12, Omic dbtype file 13, Bi-directional prefiltering result 14, Offsetted headers 15",typeid(int),(void *) &outputDbType, "^(0|[1-9]{1}[0-9]*)$"),
+        PARAM_OUTPUT_DBTYPE(PARAM_OUTPUT_DBTYPE_ID, "--output-dbtype", "Output database type", "Set database type for resulting database: Amino acid sequences 0, Nucl. seq. 1, Profiles 2, Alignment result 5, Clustering result 6, Prefiltering result 7, Taxonomy result 8, Indexed database 9, cA3M MSAs 10, FASTA or A3M MSAs 11, Generic database 12, Omit dbtype file 13, Bi-directional prefiltering result 14, Offsetted headers 15", typeid(int), (void *) &outputDbType, "^(0|[1-9]{1}[0-9]*)$"),
         //diff
-        PARAM_USESEQID(PARAM_USESEQID_ID,"--use-seq-id", "Match sequences by their id.", "Sequence ID (Uniprot, GenBank, ...) is used for identifying matches between the old and the new DB.",typeid(bool), (void *) &useSequenceId, ""),
+        PARAM_USESEQID(PARAM_USESEQID_ID, "--use-seq-id", "Match sequences by their ID", "Sequence ID (Uniprot, GenBank, ...) is used for identifying matches between the old and the new DB", typeid(bool), (void *) &useSequenceId, ""),
         // prefixid
-        PARAM_PREFIX(PARAM_PREFIX_ID, "--prefix", "Prefix", "Use this prefix for all entries", typeid(std::string),(void *) &prefix,""),
-        PARAM_TSV(PARAM_TSV_ID,"--tsv", "Tsv", "should output be in TSV format",typeid(bool),(void *) &tsvOut, ""),
+        PARAM_PREFIX(PARAM_PREFIX_ID, "--prefix", "Prefix", "Use this prefix for all entries", typeid(std::string), (void *) &prefix, ""),
+        PARAM_TSV(PARAM_TSV_ID, "--tsv", "Tsv", "Return output in TSV format", typeid(bool), (void *) &tsvOut, ""),
         // summarize headers
-        PARAM_HEADER_TYPE(PARAM_HEADER_TYPE_ID,"--header-type", "Header type", "Header Type: 1 Uniclust, 2 Metaclust",typeid(int), (void *) &headerType, "[1-2]{1}"),
+        PARAM_HEADER_TYPE(PARAM_HEADER_TYPE_ID, "--header-type", "Header type", "Header Type: 1: Uniclust, 2: Metaclust", typeid(int), (void *) &headerType, "[1-2]{1}"),
         // mergedbs
-        PARAM_MERGE_PREFIXES(PARAM_MERGE_PREFIXES_ID, "--prefixes", "Merge prefixes", "Comma separated list of prefixes for each entry", typeid(std::string),(void *) &mergePrefixes,""),
+        PARAM_MERGE_PREFIXES(PARAM_MERGE_PREFIXES_ID, "--prefixes", "Merge prefixes", "Comma separated list of prefixes for each entry", typeid(std::string), (void *) &mergePrefixes, ""),
         // summarizeresult
-        PARAM_OVERLAP(PARAM_OVERLAP_ID, "--overlap", "Overlap threshold", "Maximum overlap of covered regions", typeid(float), (void*) &overlap, "^[0-9]*(\\.[0-9]+)?$"),
+        PARAM_OVERLAP(PARAM_OVERLAP_ID, "--overlap", "Overlap threshold", "Maximum overlap of covered regions", typeid(float), (void *) &overlap, "^[0-9]*(\\.[0-9]+)?$"),
         // msa2profile
-        PARAM_MSA_TYPE(PARAM_MSA_TYPE_ID,"--msa-type", "MSA type", "MSA Type: cA3M 0, A3M 1, FASTA 2", typeid(int), (void *) &msaType, "^[0-2]{1}$"),
+        PARAM_MSA_TYPE(PARAM_MSA_TYPE_ID, "--msa-type", "MSA type", "MSA Type: 0: cA3M, 1: A3M, 2: FASTA", typeid(int), (void *) &msaType, "^[0-2]{1}$"),
         // extractalignedregion
-        PARAM_EXTRACT_MODE(PARAM_EXTRACT_MODE_ID,"--extract-mode", "Extract mode", "Query 1, Target 2", typeid(int), (void *) &extractMode, "^[1-2]{1}$"),
+        PARAM_EXTRACT_MODE(PARAM_EXTRACT_MODE_ID, "--extract-mode", "Extract mode", "Extract from 1: Query, 2: Target", typeid(int), (void *) &extractMode, "^[1-2]{1}$"),
         // convertkb
         PARAM_KB_COLUMNS(PARAM_KB_COLUMNS_ID, "--kb-columns", "UniprotKB columns", "list of indices of UniprotKB columns to be extracted", typeid(std::string), (void *) &kbColumns, ""),
-        PARAM_RECOVER_DELETED(PARAM_RECOVER_DELETED_ID, "--recover-deleted", "Recover deleted", "Indicates if sequences are allowed to be be removed during updating", typeid(bool), (void*) &recoverDeleted, ""),
+        PARAM_RECOVER_DELETED(PARAM_RECOVER_DELETED_ID, "--recover-deleted", "Recover deleted", "Indicates if sequences are allowed to be be removed during updating", typeid(bool), (void *) &recoverDeleted, ""),
         // filtertaxdb
-        PARAM_TAXON_LIST(PARAM_TAXON_LIST_ID, "--taxon-list", "Selected taxons", "taxonomy ID, possibly multiple separated by ','", typeid(std::string), (void*) &taxonList, ""),
+        PARAM_TAXON_LIST(PARAM_TAXON_LIST_ID, "--taxon-list", "Selected taxa", "Taxonomy ID, possibly multiple values separated by ','", typeid(std::string), (void *) &taxonList, ""),
         // view
-        PARAM_ID_LIST(PARAM_ID_LIST_ID, "--id-list", "Selected entries with key", "entries to be printed seperated by ','", typeid(std::string), (void*) &idList, ""),
-        PARAM_IDX_ENTRY_TYPE(PARAM_IDX_ENTRY_TYPE_ID, "--idx-entry-type", "Index entry type", "sequence; 0, src sequence 1: header: 2, src header :3 (default 0)", typeid(int), (void*) &idxEntryType, "^[0-3]{1}$"),
+        PARAM_ID_LIST(PARAM_ID_LIST_ID, "--id-list", "Selected entries with key", "Entries to be printed seperated by ','", typeid(std::string), (void *) &idList, ""),
+        PARAM_IDX_ENTRY_TYPE(PARAM_IDX_ENTRY_TYPE_ID, "--idx-entry-type", "Index entry type", "0: sequence, 1: src sequence, 2: header, 3: src header", typeid(int), (void *) &idxEntryType, "^[0-3]{1}$"),
         // lca and addtaxonomy
-        PARAM_PICK_ID_FROM(PARAM_PICK_ID_FROM_ID,"--pick-id-from", "Extract mode", "Query 1, Target 2", typeid(int), (void *) &pickIdFrom, "^[1-2]{1}$"),
-        PARAM_LCA_RANKS(PARAM_LCA_RANKS_ID, "--lca-ranks", "LCA ranks", "Add column with specified ranks (':' separated)", typeid(std::string), (void*) &lcaRanks, ""),
-        PARAM_BLACKLIST(PARAM_BLACKLIST_ID, "--blacklist", "Blacklisted taxa", "Comma separated list of ignored taxa in LCA computation", typeid(std::string), (void*)&blacklist, "([0-9]+,)?[0-9]+"),
-        PARAM_TAXON_ADD_LINEAGE(PARAM_TAXON_ADD_LINEAGE_ID, "--tax-lineage", "Show taxon lineage", "Add column with full taxonomy lineage", typeid(bool), (void*)&showTaxLineage, ""),
+        PARAM_PICK_ID_FROM(PARAM_PICK_ID_FROM_ID, "--pick-id-from", "Extract mode", "Query 1, Target 2", typeid(int), (void *) &pickIdFrom, "^[1-2]{1}$"),
+        PARAM_LCA_RANKS(PARAM_LCA_RANKS_ID, "--lca-ranks", "LCA ranks", "Add column with specified ranks (',' separated)", typeid(std::string), (void *) &lcaRanks, ""),
+        PARAM_BLACKLIST(PARAM_BLACKLIST_ID, "--blacklist", "Taxon blacklist", "Comma separated list of ignored taxa in LCA computation", typeid(std::string), (void *) &blacklist, "([0-9]+,)?[0-9]+"),
+        PARAM_TAXON_ADD_LINEAGE(PARAM_TAXON_ADD_LINEAGE_ID, "--tax-lineage", "Show taxonomic lineage", "Add column with full taxonomy lineage", typeid(bool), (void *) &showTaxLineage, ""),
+        // aggregatetax
+        PARAM_MAJORITY(PARAM_MAJORITY_ID, "--majority", "Majority threshold", "minimal fraction of agreement among taxonomically assigned sequences of a set", typeid(float), (void *) &majorityThr, "^0(\\.[0-9]+)?|^1(\\.0+)?$"),
         // taxonomyreport
-        PARAM_REPORT_MODE(PARAM_REPORT_MODE_ID,"--report-mode", "Report mode", "Taxonomy report mode 0: Kraken 1: Krona", typeid(int), (void *) &reportMode, "^[0-1]{1}$"),
-        // createtaxcb
-        PARAM_NCBI_TAX_DUMP(PARAM_NCBI_TAX_DUMP_ID, "--ncbi-tax-dump", "NCBI tax dump directory", "NCBI tax dump directory. The tax dump can be downloaded here \"ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz\"", typeid(std::string), (void*) &ncbiTaxDump, ""),
-            PARAM_TAX_MAPPING_FILE(PARAM_TAX_MAPPING_FILE_ID, "--tax-mapping-file", "Taxonomical mapping file", "File to map sequence identifer to taxonomical identifier", typeid(std::string), (void*) &taxMappingFile, ""),
+        PARAM_REPORT_MODE(PARAM_REPORT_MODE_ID, "--report-mode", "Report mode", "Taxonomy report mode 0: Kraken 1: Krona", typeid(int), (void *) &reportMode, "^[0-1]{1}$"),
+        // createtaxdb
+        PARAM_NCBI_TAX_DUMP(PARAM_NCBI_TAX_DUMP_ID, "--ncbi-tax-dump", "NCBI tax dump directory", "NCBI tax dump directory. The tax dump can be downloaded here \"ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz\"", typeid(std::string), (void *) &ncbiTaxDump, ""),
+        PARAM_TAX_MAPPING_FILE(PARAM_TAX_MAPPING_FILE_ID, "--tax-mapping-file", "Taxonomical mapping file", "File to map sequence identifer to taxonomical identifier", typeid(std::string), (void *) &taxMappingFile, ""),
         // expandaln
-        PARAM_EXPANSION_MODE(PARAM_EXPANSION_MODE_ID, "--expansion-mode", "Expansion mode", "Which hits (still meeting the alignment criteria) to use when expanding the alignment results: 0 Use all hits, 1 Use only the best hit of each target", typeid(int), (void*) &expansionMode, "^[0-2]{1}$"),
+        PARAM_EXPANSION_MODE(PARAM_EXPANSION_MODE_ID, "--expansion-mode", "Expansion mode", "Which hits (still meeting the alignment criteria) to use when expanding the alignment results: 0 Use all hits, 1 Use only the best hit of each target", typeid(int), (void *) &expansionMode, "^[0-2]{1}$"),
         // taxonomy
-        PARAM_LCA_MODE(PARAM_LCA_MODE_ID, "--lca-mode", "LCA mode", "LCA Mode 1: Single Search LCA , 2: 2bLCA, 3: approx. 2bLCA, 4: top hit", typeid(int), (void*) &taxonomySearchMode, "^[1-4]{1}$"),
-        PARAM_TAX_OUTPUT_MODE(PARAM_TAX_OUTPUT_MODE_ID, "--tax-output-mode", "Taxonomy output mode", "0: output LCA, 1: output alignment", typeid(int), (void*) &taxonomyOutpuMode, "^[0-1]{1}$"),
-        // createsubdb
-        PARAM_SUBDB_MODE(PARAM_SUBDB_MODE_ID, "--subdb-mode", "Subdb mode", "Subdb mode 0: copy data  1: soft link data and write index", typeid(int), (void*) &subDbMode, "^[0-1]{1}$")
+        PARAM_LCA_MODE(PARAM_LCA_MODE_ID, "--lca-mode", "LCA mode", "LCA Mode 1: Single Search LCA , 2: 2bLCA, 3: approx. 2bLCA, 4: top hit", typeid(int), (void *) &taxonomySearchMode, "^[1-4]{1}$"),
+        PARAM_TAX_OUTPUT_MODE(PARAM_TAX_OUTPUT_MODE_ID, "--tax-output-mode", "Taxonomy output mode", "0: output LCA, 1: output alignment", typeid(int), (void *) &taxonomyOutpuMode, "^[0-1]{1}$"),
+        // createsubdb, filtertaxseqdb
+        PARAM_SUBDB_MODE(PARAM_SUBDB_MODE_ID, "--subdb-mode", "Subdb mode", "Subdb mode 0: copy data 1: soft link data and write index", typeid(int), (void *) &subDbMode, "^[0-1]{1}$"),
+        PARAM_TAR_INCLUDE(PARAM_TAR_INCLUDE_ID, "--tar-include", "Tar Inclusion Regex", "Include file names based on this regex", typeid(std::string), (void *) &tarInclude, "^.*$"),
+        PARAM_TAR_EXCLUDE(PARAM_TAR_EXCLUDE_ID, "--tar-exclude", "Tar Exclusion Regex", "Exclude file names based on this regex", typeid(std::string), (void *) &tarExclude, "^.*$"),
+        // for modules that should handle -h themselves
+        PARAM_HELP(PARAM_HELP_ID, "-h", "Help", "Help", typeid(bool), (void *) &help, "", MMseqsParameter::COMMAND_HIDDEN),
+        PARAM_HELP_LONG(PARAM_HELP_LONG_ID, "--help", "Help", "Help", typeid(bool), (void *) &help, "", MMseqsParameter::COMMAND_HIDDEN)
 {
     if (instance) {
         Debug(Debug::ERROR) << "Parameter instance already exists!\n";
@@ -278,6 +281,35 @@ Parameters::Parameters():
     threadsandcompression.push_back(&PARAM_COMPRESSED);
     threadsandcompression.push_back(&PARAM_V);
 
+    // alignall
+    alignall.push_back(&PARAM_SUB_MAT);
+    alignall.push_back(&PARAM_ADD_BACKTRACE);
+    alignall.push_back(&PARAM_ALIGNMENT_MODE);
+//    alignall.push_back(&PARAM_WRAPPED_SCORING);
+    alignall.push_back(&PARAM_E);
+    alignall.push_back(&PARAM_MIN_SEQ_ID);
+    alignall.push_back(&PARAM_MIN_ALN_LEN);
+    alignall.push_back(&PARAM_SEQ_ID_MODE);
+//    alignall.push_back(&PARAM_ALT_ALIGNMENT);
+    alignall.push_back(&PARAM_C);
+    alignall.push_back(&PARAM_COV_MODE);
+    alignall.push_back(&PARAM_MAX_SEQ_LEN);
+    alignall.push_back(&PARAM_NO_COMP_BIAS_CORR);
+//    alignall.push_back(&PARAM_REALIGN);
+//    alignall.push_back(&PARAM_MAX_REJECTED);
+//    alignall.push_back(&PARAM_MAX_ACCEPT);
+    alignall.push_back(&PARAM_INCLUDE_IDENTITY);
+    alignall.push_back(&PARAM_PRELOAD_MODE);
+    alignall.push_back(&PARAM_PCA);
+    alignall.push_back(&PARAM_PCB);
+    alignall.push_back(&PARAM_SCORE_BIAS);
+    alignall.push_back(&PARAM_GAP_OPEN);
+    alignall.push_back(&PARAM_GAP_EXTEND);
+    alignall.push_back(&PARAM_ZDROP);
+    alignall.push_back(&PARAM_THREADS);
+    alignall.push_back(&PARAM_COMPRESSED);
+    alignall.push_back(&PARAM_V);
+
     // alignment
     align.push_back(&PARAM_SUB_MAT);
     align.push_back(&PARAM_ADD_BACKTRACE);
@@ -302,6 +334,7 @@ Parameters::Parameters():
     align.push_back(&PARAM_SCORE_BIAS);
     align.push_back(&PARAM_GAP_OPEN);
     align.push_back(&PARAM_GAP_EXTEND);
+    align.push_back(&PARAM_ZDROP);
     align.push_back(&PARAM_THREADS);
     align.push_back(&PARAM_COMPRESSED);
     align.push_back(&PARAM_V);
@@ -396,7 +429,6 @@ Parameters::Parameters():
 
     // convertprofiledb
     convertprofiledb.push_back(&PARAM_SUB_MAT);
-    convertprofiledb.push_back(&PARAM_PROFILE_TYPE);
     convertprofiledb.push_back(&PARAM_THREADS);
     convertprofiledb.push_back(&PARAM_COMPRESSED);
     convertprofiledb.push_back(&PARAM_V);
@@ -514,6 +546,13 @@ Parameters::Parameters():
     //result2msa.push_back(&PARAM_FIRST_SEQ_REP_SEQ);
     result2msa.push_back(&PARAM_V);
 
+    // result2dnamsa
+    result2dnamsa.push_back(&PARAM_THREADS);
+    result2dnamsa.push_back(&PARAM_SKIP_QUERY);
+    result2dnamsa.push_back(&PARAM_COMPRESSED);
+    //result2dnamsa.push_back(&PARAM_FIRST_SEQ_REP_SEQ);
+    result2dnamsa.push_back(&PARAM_V);
+
     // convertmsa
     convertmsa.push_back(&PARAM_IDENTIFIER_FIELD);
     convertmsa.push_back(&PARAM_COMPRESSED);
@@ -633,14 +672,15 @@ Parameters::Parameters():
     indexdb.push_back(&PARAM_SEARCH_TYPE);
     indexdb.push_back(&PARAM_SPLIT);
     indexdb.push_back(&PARAM_SPLIT_MEMORY_LIMIT);
-    indexdb.push_back(&PARAM_THREADS);
     indexdb.push_back(&PARAM_V);
+    indexdb.push_back(&PARAM_THREADS);
 
     // create kmer index
     kmerindexdb.push_back(&PARAM_SEED_SUB_MAT);
     kmerindexdb.push_back(&PARAM_K);
     kmerindexdb.push_back(&PARAM_HASH_SHIFT);
     kmerindexdb.push_back(&PARAM_KMER_PER_SEQ);
+    kmerindexdb.push_back(&PARAM_KMER_PER_SEQ_SCALE);
     kmerindexdb.push_back(&PARAM_MIN_SEQ_ID);
     kmerindexdb.push_back(&PARAM_ADJUST_KMER_LEN);
     kmerindexdb.push_back(&PARAM_SPLIT_MEMORY_LIMIT);
@@ -653,11 +693,10 @@ Parameters::Parameters():
     kmerindexdb.push_back(&PARAM_SEARCH_TYPE);
     kmerindexdb.push_back(&PARAM_SPACED_KMER_MODE);
     kmerindexdb.push_back(&PARAM_SPACED_KMER_PATTERN);
-    kmerindexdb.push_back(&PARAM_THREADS);
     kmerindexdb.push_back(&PARAM_V);
+    kmerindexdb.push_back(&PARAM_THREADS);
 
     // create db
-    createdb.push_back(&PARAM_MAX_SEQ_LEN);
     createdb.push_back(&PARAM_DB_TYPE);
     createdb.push_back(&PARAM_SHUFFLE);
     createdb.push_back(&PARAM_CREATEDB_MODE);
@@ -674,9 +713,9 @@ Parameters::Parameters():
     result2flat.push_back(&PARAM_V);
 
     // gff2db
-    gff2ffindex.push_back(&PARAM_GFF_TYPE);
-    gff2ffindex.push_back(&PARAM_ID_OFFSET);
-    gff2ffindex.push_back(&PARAM_V);
+    gff2db.push_back(&PARAM_GFF_TYPE);
+    gff2db.push_back(&PARAM_ID_OFFSET);
+    gff2db.push_back(&PARAM_V);
 
 
     // translate nucleotide
@@ -690,6 +729,7 @@ Parameters::Parameters():
     createseqfiledb.push_back(&PARAM_MIN_SEQUENCES);
     createseqfiledb.push_back(&PARAM_MAX_SEQUENCES);
     createseqfiledb.push_back(&PARAM_HH_FORMAT);
+    createseqfiledb.push_back(&PARAM_PRELOAD_MODE);
     createseqfiledb.push_back(&PARAM_THREADS);
     createseqfiledb.push_back(&PARAM_COMPRESSED);
     createseqfiledb.push_back(&PARAM_V);
@@ -703,8 +743,6 @@ Parameters::Parameters():
     filterDb.push_back(&PARAM_FILTER_FILE);
     filterDb.push_back(&PARAM_BEATS_FIRST);
     filterDb.push_back(&PARAM_MAPPING_FILE);
-    filterDb.push_back(&PARAM_THREADS);
-    filterDb.push_back(&PARAM_V);
     filterDb.push_back(&PARAM_TRIM_TO_ONE_COL);
     filterDb.push_back(&PARAM_EXTRACT_LINES);
     filterDb.push_back(&PARAM_COMP_OPERATOR);
@@ -712,9 +750,9 @@ Parameters::Parameters():
     filterDb.push_back(&PARAM_SORT_ENTRIES);
     filterDb.push_back(&PARAM_INCLUDE_IDENTITY);
     filterDb.push_back(&PARAM_JOIN_DB);
-    filterDb.push_back(&PARAM_COMPUTE_POSITIONS);
+    filterDb.push_back(&PARAM_THREADS);
     filterDb.push_back(&PARAM_COMPRESSED);
-    filterDb.push_back(&PARAM_TRANSITIVE_REPLACE);
+    filterDb.push_back(&PARAM_V);
 
     // besthitperset
     besthitbyset.push_back(&PARAM_SIMPLE_BEST_HIT);
@@ -776,6 +814,7 @@ Parameters::Parameters():
     clusthash.push_back(&PARAM_ALPH_SIZE);
     clusthash.push_back(&PARAM_MIN_SEQ_ID);
     clusthash.push_back(&PARAM_MAX_SEQ_LEN);
+    clusthash.push_back(&PARAM_PRELOAD_MODE);
     clusthash.push_back(&PARAM_THREADS);
     clusthash.push_back(&PARAM_COMPRESSED);
     clusthash.push_back(&PARAM_V);
@@ -785,6 +824,8 @@ Parameters::Parameters():
     kmermatcher.push_back(&PARAM_ALPH_SIZE);
     kmermatcher.push_back(&PARAM_MIN_SEQ_ID);
     kmermatcher.push_back(&PARAM_KMER_PER_SEQ);
+    kmermatcher.push_back(&PARAM_SPACED_KMER_MODE);
+    kmermatcher.push_back(&PARAM_SPACED_KMER_PATTERN);
     kmermatcher.push_back(&PARAM_KMER_PER_SEQ_SCALE);
     kmermatcher.push_back(&PARAM_ADJUST_KMER_LEN);
     kmermatcher.push_back(&PARAM_MASK_RESIDUES);
@@ -804,12 +845,14 @@ Parameters::Parameters():
     // kmermatcher
     kmersearch.push_back(&PARAM_SEED_SUB_MAT);
     kmersearch.push_back(&PARAM_KMER_PER_SEQ);
+    kmersearch.push_back(&PARAM_KMER_PER_SEQ_SCALE);
     kmersearch.push_back(&PARAM_MASK_RESIDUES);
     kmersearch.push_back(&PARAM_MASK_LOWER_CASE);
     kmersearch.push_back(&PARAM_COV_MODE);
     kmersearch.push_back(&PARAM_C);
     kmersearch.push_back(&PARAM_MAX_SEQ_LEN);
     kmersearch.push_back(&PARAM_PICK_N_SIMILAR);
+    kmersearch.push_back(&PARAM_RESULT_DIRECTION);
     kmersearch.push_back(&PARAM_SPLIT_MEMORY_LIMIT);
     kmersearch.push_back(&PARAM_THREADS);
     kmersearch.push_back(&PARAM_COMPRESSED);
@@ -896,6 +939,25 @@ Parameters::Parameters():
     // filtertaxdb
     filtertaxdb.push_back(&PARAM_COMPRESSED);
     filtertaxdb.push_back(&PARAM_TAXON_LIST);
+    filtertaxdb.push_back(&PARAM_THREADS);
+    filtertaxdb.push_back(&PARAM_V);
+
+    // filtertaxseqdb
+    filtertaxseqdb.push_back(&PARAM_COMPRESSED);
+    filtertaxseqdb.push_back(&PARAM_TAXON_LIST);
+    filtertaxseqdb.push_back(&PARAM_SUBDB_MODE);
+    filtertaxseqdb.push_back(&PARAM_THREADS);
+    filtertaxseqdb.push_back(&PARAM_V);
+
+    // aggregatetax
+    aggregatetax.push_back(&PARAM_COMPRESSED);
+    aggregatetax.push_back(&PARAM_MAJORITY);
+    aggregatetax.push_back(&PARAM_LCA_RANKS);
+    // TODO should we add this in the future?
+    //aggregatetax.push_back(&PARAM_BLACKLIST);
+    aggregatetax.push_back(&PARAM_TAXON_ADD_LINEAGE);
+    aggregatetax.push_back(&PARAM_THREADS);
+    aggregatetax.push_back(&PARAM_V);
 
     // lca
     lca.push_back(&PARAM_COMPRESSED);
@@ -916,10 +978,10 @@ Parameters::Parameters():
     createtaxdb.push_back(&PARAM_V);
 
     // addtaxonomy
-    addtaxonomy.push_back(&PARAM_PICK_ID_FROM);
-    addtaxonomy.push_back(&PARAM_COMPRESSED);
     addtaxonomy.push_back(&PARAM_TAXON_ADD_LINEAGE);
     addtaxonomy.push_back(&PARAM_LCA_RANKS);
+    addtaxonomy.push_back(&PARAM_PICK_ID_FROM);
+    addtaxonomy.push_back(&PARAM_COMPRESSED);
     addtaxonomy.push_back(&PARAM_THREADS);
     addtaxonomy.push_back(&PARAM_V);
 
@@ -1036,7 +1098,6 @@ Parameters::Parameters():
     taxonomy = combineList(searchworkflow, lca);
     taxonomy.push_back(&PARAM_LCA_MODE);
     taxonomy.push_back(&PARAM_TAX_OUTPUT_MODE);
-    taxonomy.push_back(&PARAM_USESEQID);
 
     // easy taxonomy
     easytaxonomy = combineList(taxonomy, addtaxonomy);
@@ -1075,41 +1136,53 @@ Parameters::Parameters():
     enrichworkflow = combineList(enrichworkflow, expandaln);
     enrichworkflow = combineList(enrichworkflow, result2profile);
 
+    databases.push_back(&PARAM_HELP);
+    databases.push_back(&PARAM_HELP_LONG);
+    databases.push_back(&PARAM_REUSELATEST);
+    databases.push_back(&PARAM_REMOVE_TMP_FILES);
+    databases.push_back(&PARAM_COMPRESSED);
+    databases.push_back(&PARAM_THREADS);
+    databases.push_back(&PARAM_V);
+
+    // tar2db
+    tar2db.push_back(&PARAM_OUTPUT_DBTYPE);
+    tar2db.push_back(&PARAM_TAR_INCLUDE);
+    tar2db.push_back(&PARAM_TAR_EXCLUDE);
+    tar2db.push_back(&PARAM_COMPRESSED);
+    tar2db.push_back(&PARAM_V);
+
     //checkSaneEnvironment();
     setDefaults();
 }
 
 
-void Parameters::printUsageMessage(const Command& command,
-                                   const unsigned int outputFlag){
+void Parameters::printUsageMessage(const Command& command, const unsigned int outputFlag, const char* extraText) {
     const std::vector<MMseqsParameter*>& parameters = *command.params;
-    bool printWholeHelpText = (outputFlag == 0xFFFFFFFF);
     std::ostringstream ss;
-    ss << "Usage: " << binary_name << " " << command.cmd << " " << command.usage << (parameters.size() > 0 ? " [options]" : "") << "\n\n";
-//    ss << (command.longDescription != NULL ? command.longDescription : command.shortDescription) << "\n\n";
-    const char * longDesc = ( command.longDescription != NULL) ?  command.longDescription : command.shortDescription;
-    const char * printDesc = (printWholeHelpText) ? longDesc : command.shortDescription;
-    ss << printDesc << "\n";
-    if(printWholeHelpText) {
-        ss <<" By " << command.author << "\n";
+    ss << "usage: " << binary_name << " " << command.cmd << " " << command.usage << (parameters.size() > 0 ? " [options]" : "") << "\n";
+    if (extraText != NULL) {
+        ss << extraText;
     }
-    ss << "\nOptions: ";
+    if (outputFlag == 0xFFFFFFFF) {
+        ss << " By " << command.author << "\n";
+    }
+    ss << "options: ";
 
     struct {
         const char* title;
-        int category;
+        unsigned int category;
     } categories[] = {
-            {"Prefilter",MMseqsParameter::COMMAND_PREFILTER},
-            {"Align",    MMseqsParameter::COMMAND_ALIGN},
-            {"Clust",    MMseqsParameter::COMMAND_CLUST},
-            {"Kmermatcher", MMseqsParameter::COMMAND_CLUSTLINEAR},
-            {"Profile",  MMseqsParameter::COMMAND_PROFILE},
-            {"Misc",     MMseqsParameter::COMMAND_MISC},
-            {"Common",   MMseqsParameter::COMMAND_COMMON},
-            {"Expert",   MMseqsParameter::COMMAND_EXPERT}
+            {"prefilter",MMseqsParameter::COMMAND_PREFILTER},
+            {"align",    MMseqsParameter::COMMAND_ALIGN},
+            {"clust",    MMseqsParameter::COMMAND_CLUST},
+            {"kmermatcher", MMseqsParameter::COMMAND_CLUSTLINEAR},
+            {"profile",  MMseqsParameter::COMMAND_PROFILE},
+            {"misc",     MMseqsParameter::COMMAND_MISC},
+            {"common",   MMseqsParameter::COMMAND_COMMON},
+            {"expert",   MMseqsParameter::COMMAND_EXPERT}
     };
 
-    bool printExpert = (MMseqsParameter::COMMAND_EXPERT & outputFlag) ;
+    const bool printExpert = MMseqsParameter::COMMAND_EXPERT & outputFlag;
     size_t maxParamWidth = 0;
     for (size_t i = 0; i < ARRAY_SIZE(categories); ++i) {
         for (size_t j = 0; j < parameters.size(); j++) {
@@ -1128,6 +1201,7 @@ void Parameters::printUsageMessage(const Command& command,
     bool printHeader = true;
     std::string paramString;
     paramString.reserve(100);
+    bool hasExpert = false;
     for (size_t i = 0; i < ARRAY_SIZE(categories); ++i) {
         bool categoryFound = false;
         for (size_t j = 0; j < parameters.size(); j++) {
@@ -1144,7 +1218,9 @@ void Parameters::printUsageMessage(const Command& command,
         }
         if (categoryFound) {
             paramString.clear();
-            paramString.append("\n ").append(categories[i].title).append(":");
+            if (outputFlag == 0xFFFFFFFF) {
+                paramString.append(categories[i].title).append(":");
+            }
             ss << paramString << std::string(maxParamWidth < paramString.size()? 1 : maxParamWidth - paramString.size(), ' ');
 
             if(printHeader==true){
@@ -1159,6 +1235,7 @@ void Parameters::printUsageMessage(const Command& command,
                 paramString.clear();
                 const MMseqsParameter * par = parameters[j];
                 bool isExpert = (par->category & MMseqsParameter::COMMAND_EXPERT);
+                hasExpert |= isExpert;
                 bool alreadyPrint = alreadyPrintMap[par->uniqid];
                 if (par->category & categories[i].category && (printExpert || isExpert == false) && alreadyPrint == false ) {
                     paramString.append(par->name);
@@ -1169,45 +1246,74 @@ void Parameters::printUsageMessage(const Command& command,
                     } else if (par->type == typeid(float)) {
                         paramString.append(" FLOAT");
                         valueString = SSTR(*(float *) par->value);
+                    } else if (par->type == typeid(double)) {
+                        paramString.append(" DOUBLE");
+                        valueString = SSTR(*(double *) par->value);
                     } else if (par->type == typeid(ByteParser)) {
                         paramString.append(" BYTE");
                         valueString = ByteParser::format(*((size_t *) par->value));
-                    } else if (par->type == typeid(ScoreMatrixFile)) {
-                        paramString.append(" MAT");
-                        valueString = ScoreMatrixFile::format(*((ScoreMatrixFile *) par->value));
                     } else if (par->type == typeid(bool)) {
-                        if (*(bool *)par->value == true) {
-                            paramString.append(" 0");
-                        }
-                        valueString = "1, set to 0 to disable";
+                        paramString.append(" BOOL");
+                        valueString = SSTR(*(bool *)par->value);
                     } else if (par->type == typeid(std::string)) {
                         paramString.append(" STR");
                         valueString = *((std::string *) par->value);
+                    } else if (par->type == typeid(MultiParam<char*>)) {
+                        paramString.append(" TWIN"); //nucl:VAL,aa:VAL"
+                        valueString = MultiParam<char*>::format(*((MultiParam<char*> *) par->value));
+                    } else if (par->type == typeid(MultiParam<int>)) {
+                        paramString.append(" TWIN"); //nucl:VAL,aa:VAL"
+                        valueString = MultiParam<int>::format(*((MultiParam<int> *) par->value));
+                    } else if (par->type == typeid(MultiParam<float>)) {
+                        paramString.append(" TWIN"); //nucl:VAL,aa:VAL"
+                        valueString = MultiParam<float>::format(*((MultiParam<float> *) par->value));
                     }
-                    ss << "   " << paramString << std::string(maxParamWidth < paramString.size()? 1 : maxParamWidth - paramString.size(), ' ');
 
-                    ss << " " << par->description;
-                    if (!(par->type == typeid(bool) && !(*(bool *)par->value))) {
-                        ss << " [" << valueString << "]";
+                    ss << " " << paramString << std::string(maxParamWidth < paramString.size()? 1 : maxParamWidth - paramString.size(), ' ');
+
+                    ss << " ";
+                    const char* description = par->description;
+                    while (description != NULL && *description != '\0') {
+                        ss.put(*description);
+                        if (*description == '\n') {
+                            ss << std::string(maxParamWidth + 2, ' ');
+                        }
+                        description++;
                     }
+                    ss << " [" << valueString << "]";
                     ss << std::endl;
                     alreadyPrintMap[par->uniqid] = true;
                 }
             }
         }
     }
-    if (printExpert == false) {
-        ss << "\n" << "An extended list of options can be obtained by calling '" << binary_name << " " << command.cmd << " -h'.\n";
-        if(command.citations > 0) {
-            ss << "\nReferences:\n";
-            for (unsigned int pos = 0 ; pos != sizeof(command.citations) * CHAR_BIT; ++pos) {
-                unsigned int citation = 1 << pos;
-                if (command.citations & citation && citations.find(citation) != citations.end()) {
-                    ss << " - " << citations.at(citation) << "\n";
-                }
+
+    if (command.examples) {
+        ss << "\nexamples:\n ";
+        const char *data = command.examples;
+        while (*data != '\0') {
+            ss.put(*data);
+            if (*data == '\n') {
+                ss.put(' ');
             }
+            data++;
+        }
+        if (data == command.examples || *(data - 1) != '\n') { // empty text has no previous char to inspect; still end the section with a newline
+            ss.put('\n');
         }
     }
+    if (command.citations > 0) {
+        ss << "\nreferences:\n";
+        for (unsigned int pos = 0; pos != sizeof(command.citations) * CHAR_BIT; ++pos) {
+            unsigned int citation = 1U << pos; // unsigned literal: the shift may reach the top bit, which a signed int cannot hold
+            if (command.citations & citation && citations.find(citation) != citations.end()) {
+                ss << " - " << citations.at(citation) << "\n";
+            }
+        }
+    }
+    if (printExpert == false && hasExpert) {
+        ss << "\n" << "Show an extended list of options by calling '" << binary_name << " " << command.cmd << " -h'.\n";
+    }
     Debug(Debug::INFO) << ss.str();
 }
 
@@ -1235,8 +1341,16 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c
                                  int outputFlags) {
     filenames.clear();
     std::vector<MMseqsParameter*> & par = *command.params;
+
+    bool canHandleHelp = false;
+    for (size_t parIdx = 0; parIdx < par.size(); parIdx++) {
+        if (par[parIdx]->uniqid == PARAM_HELP_ID || par[parIdx]->uniqid == PARAM_HELP_LONG_ID) {
+            canHandleHelp = true;
+        }
+    }
+
     size_t parametersFound = 0;
-    for(int argIdx = 0; argIdx < argc; argIdx++ ){
+    for (int argIdx = 0; argIdx < argc; argIdx++) {
         // it is a parameter if it starts with - or --
         const bool longParameter = (pargv[argIdx][0] == '-' && pargv[argIdx][1] == '-');
         if (longParameter || (pargv[argIdx][0] == '-')) {
@@ -1246,14 +1360,13 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c
                 break;
             }
             std::string parameter(pargv[argIdx]);
-            if (parameter.compare("-h") == 0 || parameter.compare("--help") == 0) {
+            if (canHandleHelp == false && (parameter.compare("-h") == 0 || parameter.compare("--help") == 0)) {
                 printUsageMessage(command, 0xFFFFFFFF);
                 EXIT(EXIT_SUCCESS);
             }
 
             bool hasUnrecognizedParameter = true;
-            for(size_t parIdx = 0; parIdx < par.size(); parIdx++){
-
+            for (size_t parIdx = 0; parIdx < par.size(); parIdx++) {
                 if(parameter.compare(par[parIdx]->name) == 0) {
                     if (typeid(bool) != par[parIdx]->type && argIdx + 1 == argc) {
                         printUsageMessage(command, outputFlags);
@@ -1274,7 +1387,7 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c
                         regfree(&regex);
                         // if no match found or two matches found (we want exactly one match)
                         if (nomatch){
-                            printUsageMessage(command, outputFlags);
+                            printUsageMessage(command, 0xFFFFFFFF);
                             Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n";
                             EXIT(EXIT_FAILURE);
                         }else{
@@ -1290,13 +1403,13 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c
 
                         // if no match found or two matches found (we want exactly one match)
                         if (nomatch){
-                            printUsageMessage(command, outputFlags);
+                            printUsageMessage(command, 0xFFFFFFFF);
                             Debug(Debug::ERROR) << "Error in argument regex " << par[parIdx]->name << "\n";
                             EXIT(EXIT_FAILURE);
                         } else {
                             size_t value = ByteParser::parse(pargv[argIdx+1]);
                             if (value == ByteParser::INVALID_SIZE) {
-                                printUsageMessage(command, outputFlags);
+                                printUsageMessage(command, 0xFFFFFFFF);
                                 Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n";
                                 EXIT(EXIT_FAILURE);
                             } else {
@@ -1305,24 +1418,46 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c
                             }
                         }
                         argIdx++;
-                    } else if (typeid(ScoreMatrixFile) == par[parIdx]->type) {
-                        ScoreMatrixFile value = ScoreMatrixFile(pargv[argIdx+1]);
-                        if (value == ScoreMatrixFile("INVALID", "INVALID")) {
-                            printUsageMessage(command, outputFlags);
+                    } else if (typeid(MultiParam<char*>) == par[parIdx]->type) {
+                        MultiParam<char*> value = MultiParam<char*>(pargv[argIdx+1]);
+                        if (value == MultiParam<char*>("INVALID", "INVALID")) {
+                            printUsageMessage(command, 0xFFFFFFFF);
+                            Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n";
+                            EXIT(EXIT_FAILURE);
+                        } else {
+                            *((MultiParam<char*> *) par[parIdx]->value) = value;
+                            par[parIdx]->wasSet = true;
+                        }
+                        argIdx++;
+                    }else if (typeid(MultiParam<int>) == par[parIdx]->type) {
+                        MultiParam<int> value = MultiParam<int>(pargv[argIdx+1]);
+                        if (value.aminoacids == INT_MAX || value.nucleotides == INT_MAX) {
+                            printUsageMessage(command, 0xFFFFFFFF);
+                            Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n";
+                            EXIT(EXIT_FAILURE);
+                        } else {
+                            *((MultiParam<int> *) par[parIdx]->value) = value;
+                            par[parIdx]->wasSet = true;
+                        }
+                        argIdx++;
+                    }else if (typeid(MultiParam<float>) == par[parIdx]->type) {
+                        MultiParam<float> value = MultiParam<float>(pargv[argIdx + 1]);
+                        if (value.aminoacids == FLT_MAX || value.nucleotides == FLT_MAX) {
+                            printUsageMessage(command, 0xFFFFFFFF);
                             Debug(Debug::ERROR) << "Error in value parsing " << par[parIdx]->name << "\n";
                             EXIT(EXIT_FAILURE);
                         } else {
-                            *((ScoreMatrixFile *) par[parIdx]->value) = value;
+                            *((MultiParam<float> *) par[parIdx]->value) = value;
                             par[parIdx]->wasSet = true;
                         }
                         argIdx++;
-                    } else if (typeid(float) == par[parIdx]->type) {
+                    }else if (typeid(float) == par[parIdx]->type) {
                         regex_t regex;
                         compileRegex(&regex, par[parIdx]->regex);
                         int nomatch = regexec(&regex, pargv[argIdx+1], 0, NULL, 0);
                         regfree(&regex);
                         if (nomatch){
-                            printUsageMessage(command, outputFlags);
+                            printUsageMessage(command, 0xFFFFFFFF);
                             Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n";
                             EXIT(EXIT_FAILURE);
                         }else{
@@ -1331,6 +1466,20 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c
                             par[parIdx]->wasSet = true;
                         }
                         argIdx++;
+                    } else if (typeid(double) == par[parIdx]->type) {
+                        regex_t regex;
+                        compileRegex(&regex, par[parIdx]->regex);
+                        int nomatch = regexec(&regex, pargv[argIdx+1], 0, NULL, 0);
+                        regfree(&regex);
+                        if (nomatch){
+                            printUsageMessage(command, 0xFFFFFFFF);
+                            Debug(Debug::ERROR) << "Error in argument " << par[parIdx]->name << "\n";
+                            EXIT(EXIT_FAILURE);
+                        }else{
+                            *((double *) par[parIdx]->value) = strtod(pargv[argIdx+1], NULL);
+                            par[parIdx]->wasSet = true;
+                        }
+                        argIdx++;
                     } else if (typeid(std::string) == par[parIdx]->type) {
                         std::string val(pargv[argIdx+1]);
                         if(val.length() != 0){
@@ -1417,36 +1566,48 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c
 #endif
 
 
+    bool ignorePathCountChecks = command.databases.empty() == false && command.databases[0].specialType & DbType::ZERO_OR_ALL && filenames.size() == 0;
     const size_t MAX_DB_PARAMETER = 6;
-
-    if (command.databases.size() > MAX_DB_PARAMETER) {
+    if (ignorePathCountChecks == false && command.databases.size() > MAX_DB_PARAMETER) {
         Debug(Debug::ERROR) << "Use argv if you need more than " << MAX_DB_PARAMETER << " db parameters" << "\n";
         EXIT(EXIT_FAILURE);
     }
 
-    if (filenames.size() < command.databases.size()){
+    if (ignorePathCountChecks == false && filenames.size() < command.databases.size()){
         printUsageMessage(command, outputFlags);
-        Debug(Debug::ERROR) << "Not enough input paths provied. Require " << command.databases.size() << " paths" << "\n";
+        Debug(Debug::ERROR) << "Not enough input paths provided. ";
+        if (command.databases.size() == 1) {
+            Debug(Debug::ERROR) << "1 path is required.\n";
+        } else {
+            Debug(Debug::ERROR) << command.databases.size() << " paths are required.\n";
+        }
         EXIT(EXIT_FAILURE);
     }
 
     bool isVar = false;
     bool isStartVar = false;
+    bool isMiddleVar = false;
     bool isEndVar = false;
-    if(command.databases[0].validator != NULL){
-    	if(command.databases.size() >= 2){
-        	isStartVar |= (command.databases[0].specialType & DbType::VARIADIC);
-        	isEndVar |= (command.databases[command.databases.size()-1].specialType & DbType::VARIADIC);
-        	isVar = isStartVar | isEndVar;
-    	}
-    	if(isVar == false){
-        	if (filenames.size() > command.databases.size()){
-            		printUsageMessage(command, outputFlags);
-            		Debug(Debug::ERROR) <<"Too many input paths provided. Only " << SSTR(command.databases.size()) << " are allowed\n";
-           		EXIT(EXIT_FAILURE);
-        	}
-   	}
-   }
+    if(command.databases.empty() == false && command.databases[0].validator != NULL) {
+        if (command.databases.size() >= 2) {
+            for(size_t i = 0; i < command.databases.size();i++){
+                if(i == 0){
+                    isStartVar |= (command.databases[i].specialType & DbType::VARIADIC);
+                } else if(i == command.databases.size() - 1){
+                    isEndVar |= (command.databases[i].specialType & DbType::VARIADIC);
+                } else {
+                    isMiddleVar |= (command.databases[i].specialType & DbType::VARIADIC);
+                }
+
+            }
+            isVar = isStartVar | isMiddleVar | isEndVar;
+        }
+        if (ignorePathCountChecks == false && isVar == false && filenames.size() > command.databases.size()) {
+            printUsageMessage(command, outputFlags);
+            Debug(Debug::ERROR) << "Too many input paths provided. Only " << SSTR(command.databases.size()) << " are allowed\n";
+            EXIT(EXIT_FAILURE);
+        }
+    }
     switch (std::min(filenames.size(), MAX_DB_PARAMETER)) {
         case 6:
             db6 = filenames[5];
@@ -1532,12 +1693,16 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c
                 break;
             // FALLTHROUGH
         case 0:
+            if (parseFlags & PARSE_ALLOW_EMPTY)
+                break;
             printUsageMessage(command, outputFlags);
             Debug(Debug::ERROR) << "Unrecognized parameters!" << "\n";
             printParameters(command.cmd, argc, pargv, par);
             EXIT(EXIT_FAILURE);
     }
-    checkIfDatabaseIsValid(command, isStartVar, isEndVar);
+    if (ignorePathCountChecks == false) {
+        checkIfDatabaseIsValid(command, isStartVar, isMiddleVar, isEndVar);
+    }
 
     if(printPar == true) {
         printParameters(command.cmd, argc, pargv, par);
@@ -1567,7 +1732,7 @@ void Parameters::checkIfTaxDbIsComplete(std::string & filename){
         }
 }
 
-void Parameters::checkIfDatabaseIsValid(const Command& command, bool isStartVar, bool isEndVar) {
+void Parameters::checkIfDatabaseIsValid(const Command& command, bool isStartVar, bool isMiddleVar, bool isEndVar) {
     size_t fileIdx = 0;
     for (size_t dbIdx = 0; dbIdx < command.databases.size(); dbIdx++) {
         const DbType &db = command.databases[dbIdx];
@@ -1577,10 +1742,13 @@ void Parameters::checkIfDatabaseIsValid(const Command& command, bool isStartVar,
             size_t argumentDist = 0;
             if(dbIdx == 0 && isStartVar){
                 argumentDist = (filenames.size() - command.databases.size());
-            }
-            if(dbIdx == command.databases.size() - 1 && isEndVar){
+            }else if(dbIdx == command.databases.size() - 1 && isEndVar){
+                argumentDist = (filenames.size() - command.databases.size());
+            }else if((command.databases[dbIdx].specialType & DbType::VARIADIC) && isMiddleVar){
                 argumentDist = (filenames.size() - command.databases.size());
             }
+
+
             size_t currFileIdx = fileIdx;
             for(; fileIdx <= currFileIdx+argumentDist; fileIdx++){
                 if (db.validator == NULL) {
@@ -1590,15 +1758,15 @@ void Parameters::checkIfDatabaseIsValid(const Command& command, bool isStartVar,
                 std::string dbTypeFile = std::string(filenames[fileIdx]) + ".dbtype";
                 // check if file exists
                 // if file is not a
-                if (FileUtil::fileExists((filenames[fileIdx]).c_str()) == false && FileUtil::fileExists(dbTypeFile.c_str()) == false ) {
+                if (FileUtil::fileExists((filenames[fileIdx]).c_str()) == false && FileUtil::fileExists(dbTypeFile.c_str()) == false && filenames[fileIdx] != "stdin" ) {
                     Debug(Debug::ERROR) << "Input " << filenames[fileIdx] << " does not exist.\n";
                     EXIT(EXIT_FAILURE);
                 }
                 int dbtype = FileUtil::parseDbType(filenames[fileIdx].c_str());
                 if (db.specialType & DbType::NEED_HEADER) {
-                    if (FileUtil::fileExists((filenames[fileIdx] + "_h").c_str()) == false && Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_INDEX_DB)==false) {
-                        Debug(Debug::ERROR) << "Database " << filenames[fileIdx] << " need header information.\n"
-                                            << "The " << filenames[fileIdx] << "_h is missing.\n";
+                    if (FileUtil::fileExists((filenames[fileIdx] + "_h.dbtype").c_str()) == false && Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_INDEX_DB)==false) {
+                        Debug(Debug::ERROR) << "Database " << filenames[fileIdx] << " needs header information.\n"
+                                            << filenames[fileIdx] << "_h is missing.\n";
                         EXIT(EXIT_FAILURE);
                     }
                 }
@@ -1607,15 +1775,17 @@ void Parameters::checkIfDatabaseIsValid(const Command& command, bool isStartVar,
                 }
                 if (db.specialType & DbType::NEED_LOOKUP) {
                     if (FileUtil::fileExists((filenames[fileIdx] + ".lookup").c_str()) == false) {
-                        Debug(Debug::ERROR) << "Database " << filenames[fileIdx] << " need a lookup file.\n"
-                                            << "The " << filenames[fileIdx] << ".lookup is missing.\n";
+                        Debug(Debug::ERROR) << "Database " << filenames[fileIdx] << " needs a lookup file.\n"
+                                            << filenames[fileIdx] << ".lookup is missing.\n";
                         EXIT(EXIT_FAILURE);
                     }
                 }
                 bool dbtypeFound = false;
                 for (size_t i = 0; i < db.validator->size() && dbtypeFound == false; i++) {
                     int validatorDbtype = db.validator->at(i);
-                    if (validatorDbtype == Parameters::DBTYPE_FLATFILE) {
+                    if (validatorDbtype == Parameters::DBTYPE_STDIN) {
+                        dbtypeFound = (filenames[fileIdx] == "stdin");
+                    } else if (validatorDbtype == Parameters::DBTYPE_FLATFILE) {
                         dbtypeFound = (FileUtil::fileExists(filenames[fileIdx].c_str()) == true &&
                                        FileUtil::directoryExists(filenames[fileIdx].c_str()) == false);
                     } else if (validatorDbtype == Parameters::DBTYPE_DIRECTORY) {
@@ -1652,7 +1822,7 @@ void Parameters::checkIfDatabaseIsValid(const Command& command, bool isStartVar,
                 fileIdx++;
             } else {
                 if (FileUtil::fileExists(filenames[fileIdx].c_str()) == true) {
-                    Debug(Debug::WARNING) << filenames[dbIdx] << " exists and will be overwritten.\n";
+                    Debug(Debug::WARNING) << filenames[fileIdx] << " exists and will be overwritten.\n";
                 }
                 fileIdx++;
 //                FILE *fp = fopen(filenames[dbIdx].c_str(), "a");
@@ -1668,6 +1838,8 @@ void Parameters::checkIfDatabaseIsValid(const Command& command, bool isStartVar,
 //                fclose(fp);
 //                FileUtil::remove(filenames[dbIdx].c_str());
             }
+        } else {
+            fileIdx++;
         }
     }
 }
@@ -1701,15 +1873,24 @@ void Parameters::printParameters(const std::string &module, int argc, const char
 
 
     for (size_t i = 0; i < par.size(); i++) {
+        if (par[i]->category & MMseqsParameter::COMMAND_HIDDEN) {
+            continue;
+        }
         ss << std::setw(maxWidth) << std::left << par[i]->display << "\t";
         if(typeid(int) == par[i]->type ){
             ss << *((int *)par[i]->value);
         } else if(typeid(ByteParser) == par[i]->type) {
             ss << ByteParser::format(*((size_t *)par[i]->value));
-        } else if(typeid(ScoreMatrixFile) == par[i]->type) {
-            ss << ScoreMatrixFile::format(*((ScoreMatrixFile *)par[i]->value));
+        } else if(typeid(MultiParam<char*>) == par[i]->type) {
+            ss << MultiParam<char*>::format(*((MultiParam<char*> *)par[i]->value));
+        } else if(typeid(MultiParam<int>) == par[i]->type) {
+            ss << MultiParam<int>::format(*((MultiParam<int> *)par[i]->value));
+        } else if(typeid(MultiParam<float>) == par[i]->type) {
+            ss << MultiParam<float>::format(*((MultiParam<float> *)par[i]->value));
         } else if(typeid(float) == par[i]->type) {
             ss << *((float *)par[i]->value);
+        } else if(typeid(double) == par[i]->type) {
+            ss << *((double *)par[i]->value);
         } else if(typeid(std::string) == par[i]->type) {
             ss << *((std::string *) par[i]->value);
         } else if (typeid(bool) == par[i]->type) {
@@ -1721,16 +1902,18 @@ void Parameters::printParameters(const std::string &module, int argc, const char
     Debug(Debug::INFO) << ss.str() << "\n";
 }
 
+
+
 void Parameters::setDefaults() {
     restArgv = NULL;
     restArgc = 0;
 
-    scoringMatrixFile =  ScoreMatrixFile("blosum62.out", "nucleotide.out");
-    seedScoringMatrixFile = ScoreMatrixFile("VTML80.out", "nucleotide.out");
+    scoringMatrixFile =  MultiParam<char*>("blosum62.out", "nucleotide.out");
+    seedScoringMatrixFile = MultiParam<char*>("VTML80.out", "nucleotide.out");
 
     kmerSize =  0;
     kmerScore = INT_MAX;
-    alphabetSize = 21;
+    alphabetSize = MultiParam<int>(21,5);
     maxSeqLen = MAX_SEQ_LEN; // 2^16
     maxResListLen = 300;
     sensitivity = 4;
@@ -1785,12 +1968,13 @@ void Parameters::setDefaults() {
     seqIdThr = 0.0;
     alnLenThr = 0;
     altAlignment = 0;
-    gapOpen = 11;
-    gapExtend = 1;
+    gapOpen = MultiParam<int>(11, 5);
+    gapExtend = MultiParam<int>(1, 2);
+    zdrop = 40;
     addBacktrace = false;
     realign = false;
     clusteringMode = SET_COVER;
-    cascaded = true;
+    singleStepClustering = false;
     clusterReassignment = 0;
     clusterSteps = 3;
     preloadMode = 0;
@@ -1811,9 +1995,6 @@ void Parameters::setDefaults() {
     // Clustering workflow
     removeTmpFiles = false;
 
-    // convertprofiledb
-    profileMode = PROFILE_MODE_HMM;
-
     // indexdb
     checkCompatible = 0;
     searchType = SEARCH_TYPE_AUTO;
@@ -1963,12 +2144,13 @@ void Parameters::setDefaults() {
 
     // linearcluster
     kmersPerSequence = 21;
-    kmersPerSequenceScale = 0.0;
+    kmersPerSequenceScale = MultiParam<float>(0.0, 0.2);
     includeOnlyExtendable = false;
     ignoreMultiKmer = false;
-    hashShift = 5;
+    hashShift = 67;
     pickNbest = 1;
     adjustKmerLength = false;
+    resultDirection = Parameters::PARAM_RESULT_DIRECTION_TARGET;
     // result2stats
     stat = "";
 
@@ -1982,7 +2164,7 @@ void Parameters::setDefaults() {
     taxMappingFile = "";
     ncbiTaxDump = "";
 
-    // filtertaxdb
+    // filtertaxdb, filtertaxseqdb
     taxonList = "";
 
     // view
@@ -1995,6 +2177,10 @@ void Parameters::setDefaults() {
     // createsubdb
     subDbMode = Parameters::SUBDB_MODE_HARD;
 
+    // tar2db
+    tarInclude = ".*";
+    tarExclude = "^$";
+
     lcaRanks = "";
     showTaxLineage = false;
     // bin for all unclassified sequences
@@ -2003,6 +2189,9 @@ void Parameters::setDefaults() {
     // https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=28384
     blacklist = "12908,28384";
 
+    // aggregatetax
+    majorityThr = 0;
+
     // taxonomyreport
     reportMode = 0;
 
@@ -2061,6 +2250,7 @@ size_t Parameters::hashParameter(const std::vector<std::string> &filenames, cons
     return Util::hash(hashString.c_str(), hashString.size());
 }
 
+
 std::string Parameters::createParameterString(const std::vector<MMseqsParameter*> &par, bool wasSet) {
     std::ostringstream ss;
     for (size_t i = 0; i < par.size(); ++i) {
@@ -2080,12 +2270,12 @@ std::string Parameters::createParameterString(const std::vector<MMseqsParameter*
         } else if (typeid(ByteParser) == par[i]->type) {
             ss << par[i]->name << " ";
             ss << ByteParser::format(*((size_t *)par[i]->value)) << " ";
-        } else if (typeid(ScoreMatrixFile) == par[i]->type) {
-            ss << par[i]->name << " ";
-            ss << ScoreMatrixFile::format(*((ScoreMatrixFile *)par[i]->value)) << " ";
         } else if (typeid(float) == par[i]->type){
             ss << par[i]->name << " ";
             ss << *((float *)par[i]->value) << " ";
+        } else if (typeid(double) == par[i]->type){
+            ss << par[i]->name << " ";
+            ss << *((double *)par[i]->value) << " ";
         } else if (typeid(std::string) == par[i]->type){
             if (*((std::string *) par[i]->value) != "") {
                 ss << par[i]->name << " ";
@@ -2098,6 +2288,15 @@ std::string Parameters::createParameterString(const std::vector<MMseqsParameter*
             } else {
                 ss << par[i]->name << " 0 ";
             }
+        } else if (typeid(MultiParam<char*>) == par[i]->type) {
+            ss << par[i]->name << " ";
+            ss << MultiParam<char*>::format(*((MultiParam<char*> *) par[i]->value)) << " ";
+        } else if (typeid(MultiParam<int>) == par[i]->type) {
+            ss << par[i]->name << " ";
+            ss << MultiParam<int>::format(*((MultiParam<int> *) par[i]->value)) << " ";
+        } else if (typeid(MultiParam<float>) == par[i]->type) {
+            ss << par[i]->name << " ";
+            ss << MultiParam<float>::format(*((MultiParam<float> *) par[i]->value)) << " ";
         } else {
             Debug(Debug::ERROR) << "Wrong parameter type. Please inform the developers!\n";
             EXIT(EXIT_FAILURE);
@@ -2117,24 +2316,16 @@ std::vector<MMseqsParameter*> Parameters::removeParameter(const std::vector<MMse
     return newParamList;
 }
 
-void Parameters::overrideParameterDescription(Command &command, const int uid,
-                                              const char *description, const char *regex, const int category) {
-    for (size_t i = 0; i < command.params->size(); i++) {
-        MMseqsParameter *p = command.params->at(i);
-        if (p->uniqid == uid) {
-            if (description != NULL) {
-                p->description = description;
-            }
-            if (regex != NULL) {
-                p->regex = regex;
-            }
-            if (category != 0) {
-                p->category = category;
-            }
-            break;
-        }
+void Parameters::overrideParameterDescription(MMseqsParameter& par, const char *description, const char *regex, int category) {
+    if (description != NULL) {
+        par.description = description;
+    }
+    if (regex != NULL) {
+        par.regex = regex;
+    }
+    if (category != 0) {
+        par.category = category;
     }
-
 }
 
 std::vector<int> Parameters::getOutputFormat(const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders,
diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
index e86bd77..2f6cd11 100644
--- a/src/commons/Parameters.h
+++ b/src/commons/Parameters.h
@@ -12,7 +12,7 @@
 #include <utility>
 
 #include "Command.h"
-#include "ScoreMatrixFile.h"
+#include "MultiParam.h"
 
 #define PARAMETER(x) const static int x##_ID = __COUNTER__; \
     				 MMseqsParameter x;
@@ -36,6 +36,7 @@ struct MMseqsParameter {
     static const unsigned int COMMAND_MISC = 32;
     static const unsigned int COMMAND_CLUSTLINEAR = 64;
     static const unsigned int COMMAND_EXPERT = 128;
+    static const unsigned int COMMAND_HIDDEN = 256;
 
 
     MMseqsParameter(int uid, const char * n, const char *display,
@@ -43,6 +44,18 @@ struct MMseqsParameter {
                     void * value, const char * regex, unsigned int category = COMMAND_MISC):
             name(n), display(display), description(d), type(hash), value(value),
             regex(regex), uniqid(uid), category(category), wasSet(false){}
+
+    void addCategory(unsigned int cat) {
+        category |= cat;
+    }
+
+    void removeCategory(unsigned int cat) {
+        category &= ~cat;
+    }
+
+    void replaceCategory(unsigned int cat) {
+        category = cat;
+    }
 };
 
 
@@ -68,6 +81,8 @@ class Parameters {
     static const int DBTYPE_DIRECTORY = 16; // needed for verification
     static const int DBTYPE_FLATFILE = 17; // needed for verification
     static const int DBTYPE_SEQTAXDB = 18; // needed for verification
+    static const int DBTYPE_STDIN = 19; // needed for verification
+
 
     // don't forget to add new database types to DBReader::getDbTypeName and Parameters::PARAM_OUTPUT_DBTYPE
 
@@ -75,6 +90,7 @@ class Parameters {
     static const int SEARCH_TYPE_PROTEIN = 1;
     static const int SEARCH_TYPE_TRANSLATED = 2;
     static const int SEARCH_TYPE_NUCLEOTIDES = 3;
+    static const int SEARCH_TYPE_TRANS_NUCL_ALN = 4;
     // flag
     static const int SEARCH_MODE_FLAG_QUERY_AMINOACID = 1;
     static const int SEARCH_MODE_FLAG_TARGET_AMINOACID = 2;
@@ -85,15 +101,12 @@ class Parameters {
     static const int SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE = 64;
     static const int SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE = 128;
 
-
-
     static const unsigned int ALIGNMENT_MODE_FAST_AUTO = 0;
     static const unsigned int ALIGNMENT_MODE_SCORE_ONLY = 1;
     static const unsigned int ALIGNMENT_MODE_SCORE_COV = 2;
     static const unsigned int ALIGNMENT_MODE_SCORE_COV_SEQID = 3;
     static const unsigned int ALIGNMENT_MODE_UNGAPPED = 4;
 
-
     static const unsigned int WRITER_ASCII_MODE = 0;
     static const unsigned int WRITER_COMPRESSED_MODE = 1;
     static const unsigned int WRITER_LEXICOGRAPHIC_MODE = 2;
@@ -144,11 +157,6 @@ class Parameters {
     static std::vector<int> getOutputFormat(const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders,
                                             bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy);
 
-    // convertprofiledb
-    static const int PROFILE_MODE_HMM = 0;
-    static const int PROFILE_MODE_PSSM = 1;
-    static const int PROFILE_MODE_HMM3 = 2;
-
     // clustering
     static const int SET_COVER = 0;
     static const int CONNECTED_COMPONENT = 1;
@@ -175,6 +183,7 @@ class Parameters {
 
     static const int PARSE_VARIADIC = 1;
     static const int PARSE_REST = 2;
+    static const int PARSE_ALLOW_EMPTY = 4;
 
     // preload mode
     static const int PRELOAD_MODE_AUTO = 0;
@@ -201,11 +210,11 @@ class Parameters {
     static const int EXTRACT_TARGET = 2;
 
     static const int CLUST_HASH_DEFAULT_ALPH_SIZE = 3;
+    static const int CLUST_HASH_DEFAULT_MIN_SEQ_ID = 99;
     static const int CLUST_LINEAR_DEFAULT_ALPH_SIZE = 13;
     static const int CLUST_LINEAR_DEFAULT_K = 0;
     static const int CLUST_LINEAR_KMER_PER_SEQ = 0;
 
-
     // cov mode
     static const int COV_MODE_BIDIRECTIONAL  = 0;
     static const int COV_MODE_TARGET = 1;
@@ -240,10 +249,14 @@ class Parameters {
     static const int HEADER_TYPE_UNICLUST = 1;
     static const int HEADER_TYPE_METACLUST = 2;
 
-    // create subdb type
+    // createsubdb, filtertaxseqdb type
     static const int SUBDB_MODE_HARD = 0;
     static const int SUBDB_MODE_SOFT = 1;
 
+    // result direction
+    static const int PARAM_RESULT_DIRECTION_QUERY  = 0;
+    static const int PARAM_RESULT_DIRECTION_TARGET = 1;
+
     // path to databases
     std::string db1;
     std::string db1Index;
@@ -304,8 +317,8 @@ class Parameters {
     const char** restArgv;
     int restArgc;
 
-    ScoreMatrixFile scoringMatrixFile;       // path to scoring matrix
-    ScoreMatrixFile seedScoringMatrixFile;   // seed sub. matrix
+    MultiParam<char*> scoringMatrixFile;       // path to scoring matrix
+    MultiParam<char*> seedScoringMatrixFile;   // seed sub. matrix
     size_t maxSeqLen;                    // sequence length
     size_t maxResListLen;                // Maximal result list length per query
     int    verbosity;                    // log level
@@ -318,7 +331,7 @@ class Parameters {
     float  sensitivity;                  // target sens
     int    kmerSize;                     // kmer size for the prefilter
     int    kmerScore;                    // kmer score for the prefilter
-    int    alphabetSize;                 // alphabet size for the prefilter
+    MultiParam<int> alphabetSize;                 // alphabet size for the prefilter
     int    compBiasCorrection;           // Aminoacid composiont correction
     bool   diagonalScoring;              // switch diagonal scoring
     int    exactKmerMatching;            // only exact k-mer matching
@@ -352,8 +365,9 @@ class Parameters {
     int    alnLenThr;                    // min. alignment length
     bool   addBacktrace;                 // store backtrace string (M=Match, D=deletion, I=insertion)
     bool   realign;                      // realign hit with more conservative score
-    int    gapOpen;                      // gap open
-    int    gapExtend;                    // gap extend
+    MultiParam<int> gapOpen;             // gap open cost
+    MultiParam<int> gapExtend;           // gap extension cost
+    int    zdrop;                        // zdrop
 
     // workflow
     std::string runner;
@@ -362,7 +376,7 @@ class Parameters {
     // CLUSTERING
     int    clusteringMode;
     int    clusterSteps;
-    bool   cascaded;
+    bool   singleStepClustering;
     int    clusterReassignment;
 
     // SEARCH WORKFLOW
@@ -392,9 +406,6 @@ class Parameters {
     int translate;
     int createLookup;
 
-    // convertprofiledb
-    int profileMode;
-
     // convertalis
     int formatAlignmentMode;            // BLAST_TAB, PAIRWISE or SAM
     std::string outfmt;
@@ -451,12 +462,13 @@ class Parameters {
 
     // linearcluster
     int kmersPerSequence;
-    float kmersPerSequenceScale;
+    MultiParam<float> kmersPerSequenceScale;
     bool includeOnlyExtendable;
     bool ignoreMultiKmer;
     int hashShift;
     int pickNbest;
     int adjustKmerLength;
+    int resultDirection;
 
     // indexdb
     int checkCompatible;
@@ -500,13 +512,11 @@ class Parameters {
     bool positiveFilter;
     bool trimToOneColumn;
     int extractLines;
-    float compValue;
+    double compValue;
     std::string compOperator;
     int sortEntries;
     bool beatsFirst;
     std::string joinDB;
-    std::string compPos ;
-    std::string clusterFile ;
 
     // besthitperset
     bool simpleBestHit;
@@ -551,7 +561,7 @@ class Parameters {
     // summarize headers
     int headerType;
 
-    // filtertaxdb
+    // filtertaxdb, filtertaxseqdb
     std::string taxonList;
 
     // view
@@ -564,6 +574,9 @@ class Parameters {
     bool showTaxLineage;
     std::string blacklist;
 
+    // aggregatetax
+    float majorityThr;
+
     // taxonomyreport
     int reportMode;
 
@@ -581,6 +594,13 @@ class Parameters {
     // createsubdb
     int subDbMode;
 
+    // tar2db
+    std::string tarInclude;
+    std::string tarExclude;
+
+    // for modules that should handle -h themselves
+    bool help;
+
     // tool citations
     std::map<unsigned int, const char*> citations;
 
@@ -598,12 +618,11 @@ class Parameters {
     void setDefaults();
     void parseParameters(int argc, const char *pargv[], const Command &command, bool printPar, int parseFlags,
                          int outputFlags);
-    void printUsageMessage(const Command& command,
-                           unsigned int outputFlag);
+    void printUsageMessage(const Command& command, unsigned int outputFlag, const char* extraText = NULL);
     void printParameters(const std::string &module, int argc, const char* pargv[],
                          const std::vector<MMseqsParameter*> &par);
 
-    void checkIfDatabaseIsValid(const Command& command, bool isStartVar, bool isEndVar);
+    void checkIfDatabaseIsValid(const Command& command, bool isStartVar, bool isMiddleVar, bool isEndVar);
 
     std::vector<MMseqsParameter*> removeParameter(const std::vector<MMseqsParameter*>& par, const MMseqsParameter& x);
 
@@ -654,7 +673,7 @@ class Parameters {
     PARAMETER(PARAM_ALT_ALIGNMENT)
     PARAMETER(PARAM_GAP_OPEN)
     PARAMETER(PARAM_GAP_EXTEND)
-    std::vector<MMseqsParameter*> align;
+    PARAMETER(PARAM_ZDROP)
 
     // clustering
     PARAMETER(PARAM_CLUSTER_MODE)
@@ -670,9 +689,6 @@ class Parameters {
     PARAMETER(PARAM_V)
     std::vector<MMseqsParameter*> clust;
 
-    // create profile (HMM, PSSM)
-    PARAMETER(PARAM_PROFILE_TYPE)
-
     // format alignment
     PARAMETER(PARAM_FORMAT_MODE)
     PARAMETER(PARAM_FORMAT_OUTPUT)
@@ -734,7 +750,7 @@ class Parameters {
     PARAMETER(PARAM_HASH_SHIFT)
     PARAMETER(PARAM_PICK_N_SIMILAR)
     PARAMETER(PARAM_ADJUST_KMER_LEN)
-
+    PARAMETER(PARAM_RESULT_DIRECTION)
     // workflow
     PARAMETER(PARAM_RUNNER)
     PARAMETER(PARAM_REUSELATEST)
@@ -808,8 +824,6 @@ class Parameters {
     PARAMETER(PARAM_SORT_ENTRIES)
     PARAMETER(PARAM_BEATS_FIRST)
     PARAMETER(PARAM_JOIN_DB)
-    PARAMETER(PARAM_COMPUTE_POSITIONS)
-    PARAMETER(PARAM_TRANSITIVE_REPLACE)
 
     //besthitperset
     PARAMETER(PARAM_SIMPLE_BEST_HIT)
@@ -857,19 +871,22 @@ class Parameters {
     // clusterupdate
     PARAMETER(PARAM_RECOVER_DELETED)
 
-    // filtertaxdb
+    // filtertaxdb, filtertaxseqdb
     PARAMETER(PARAM_TAXON_LIST)
 
     // view
     PARAMETER(PARAM_ID_LIST)
     PARAMETER(PARAM_IDX_ENTRY_TYPE)
 
-    // lca and addtaxonomy
+    // lca, addtaxonomy and aggregatetax
     PARAMETER(PARAM_PICK_ID_FROM)
     PARAMETER(PARAM_LCA_RANKS)
     PARAMETER(PARAM_BLACKLIST)
     PARAMETER(PARAM_TAXON_ADD_LINEAGE)
 
+    // aggregatetax
+    PARAMETER(PARAM_MAJORITY)
+
     // taxonomyreport
     PARAMETER(PARAM_REPORT_MODE)
 
@@ -886,6 +903,14 @@ class Parameters {
     // createsubdb
     PARAMETER(PARAM_SUBDB_MODE)
 
+    // tar2db
+    PARAMETER(PARAM_TAR_INCLUDE)
+    PARAMETER(PARAM_TAR_EXCLUDE)
+
+    // for modules that should handle -h themselves
+    PARAMETER(PARAM_HELP)
+    PARAMETER(PARAM_HELP_LONG)
+
 
     std::vector<MMseqsParameter*> empty;
     std::vector<MMseqsParameter*> onlyverbosity;
@@ -894,6 +919,8 @@ class Parameters {
     std::vector<MMseqsParameter*> onlythreads;
     std::vector<MMseqsParameter*> threadsandcompression;
 
+    std::vector<MMseqsParameter*> alignall;
+    std::vector<MMseqsParameter*> align;
     std::vector<MMseqsParameter*> rescorediagonal;
     std::vector<MMseqsParameter*> alignbykmer;
     std::vector<MMseqsParameter*> createFasta;
@@ -902,6 +929,7 @@ class Parameters {
     std::vector<MMseqsParameter*> result2profile;
     std::vector<MMseqsParameter*> result2pp;
     std::vector<MMseqsParameter*> result2msa;
+    std::vector<MMseqsParameter*> result2dnamsa;
     std::vector<MMseqsParameter*> convertmsa;
     std::vector<MMseqsParameter*> msa2profile;
     std::vector<MMseqsParameter*> createtsv;
@@ -920,7 +948,7 @@ class Parameters {
     std::vector<MMseqsParameter*> createdb;
     std::vector<MMseqsParameter*> convert2fasta;
     std::vector<MMseqsParameter*> result2flat;
-    std::vector<MMseqsParameter*> gff2ffindex;
+    std::vector<MMseqsParameter*> gff2db;
     std::vector<MMseqsParameter*> clusthash;
     std::vector<MMseqsParameter*> kmermatcher;
     std::vector<MMseqsParameter*> kmersearch;
@@ -960,6 +988,8 @@ class Parameters {
     std::vector<MMseqsParameter*> addtaxonomy;
     std::vector<MMseqsParameter*> taxonomyreport;
     std::vector<MMseqsParameter*> filtertaxdb;
+    std::vector<MMseqsParameter*> filtertaxseqdb;
+    std::vector<MMseqsParameter*> aggregatetax;
     std::vector<MMseqsParameter*> taxonomy;
     std::vector<MMseqsParameter*> easytaxonomy;
     std::vector<MMseqsParameter*> createsubdb;
@@ -974,6 +1004,8 @@ class Parameters {
     std::vector<MMseqsParameter*> expandaln;
     std::vector<MMseqsParameter*> sortresult;
     std::vector<MMseqsParameter*> enrichworkflow;
+    std::vector<MMseqsParameter*> databases;
+    std::vector<MMseqsParameter*> tar2db;
 
     std::vector<MMseqsParameter*> combineList(const std::vector<MMseqsParameter*> &par1,
                                              const std::vector<MMseqsParameter*> &par2);
@@ -982,7 +1014,7 @@ class Parameters {
 
     std::string createParameterString(const std::vector<MMseqsParameter*> &vector, bool wasSet = false);
 
-    void overrideParameterDescription(Command& command, int uid, const char* description, const char* regex = NULL, int category = 0);
+    void overrideParameterDescription(MMseqsParameter& par, const char *description, const char *regex = NULL, int category = 0);
 
     static void checkIfTaxDbIsComplete(std::string & filename);
 
diff --git a/src/commons/ScoreMatrixFile.cpp b/src/commons/ScoreMatrixFile.cpp
deleted file mode 100644
index 55aa388..0000000
--- a/src/commons/ScoreMatrixFile.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "ScoreMatrixFile.h"
-#include "Debug.h"
-
-#include <cstring>
-#include <cstdlib>
-
-ScoreMatrixFile::ScoreMatrixFile(const char* filename) {
-    if (strchr(filename, ',') != NULL) {
-        size_t len = strlen(filename);
-        aminoacids = (char*) malloc(len * sizeof(char));
-        nucleotides = (char*) malloc(len * sizeof(char));
-        if (sscanf(filename, "aa:%[^,],nucl:%s", aminoacids, nucleotides) != 2 && sscanf(filename, "nucl:%[^,],aa:%s", nucleotides, aminoacids) != 2) {
-            free(nucleotides);
-            free(aminoacids);
-            nucleotides = strdup("INVALID");
-            aminoacids = strdup("INVALID");
-        }
-    } else {
-        nucleotides = strdup(filename);
-        aminoacids = strdup(filename);
-    }
-}
-
-ScoreMatrixFile::ScoreMatrixFile(const char* aminoacids, const char* nucleotides) {
-    this->nucleotides = strdup(nucleotides);
-    this->aminoacids = strdup(aminoacids);
-}
-
-ScoreMatrixFile& ScoreMatrixFile::operator=(const ScoreMatrixFile& other) {
-    if (nucleotides != NULL) {
-        free(nucleotides);
-    }
-    if (aminoacids != NULL) {
-        free(aminoacids);
-    }
-    nucleotides = strdup(other.nucleotides);
-    aminoacids = strdup(other.aminoacids);
-    return *this;
-}
-
-ScoreMatrixFile::~ScoreMatrixFile() {
-    free(nucleotides);
-    free(aminoacids);
-}
-
-bool ScoreMatrixFile::operator==(const char* other) const {
-    return strncmp(other, nucleotides, strlen(nucleotides)) == 0 || strncmp(other, aminoacids, strlen(aminoacids)) == 0;
-}
-
-bool ScoreMatrixFile::operator==(const std::string& other) const {
-    return strncmp(other.c_str(), nucleotides, strlen(nucleotides)) == 0 || strncmp(other.c_str(), aminoacids, strlen(aminoacids)) == 0;
-}
-
-bool ScoreMatrixFile::operator==(const ScoreMatrixFile& other) const {
-    return strncmp(other.nucleotides, nucleotides, strlen(nucleotides)) == 0 && strncmp(other.aminoacids, aminoacids, strlen(aminoacids)) == 0;
-}
-
-std::string ScoreMatrixFile::format(const ScoreMatrixFile &file) {
-    if (strncmp(file.nucleotides, file.aminoacids, strlen(file.aminoacids)) == 0) {
-        return file.nucleotides;
-    } else {
-        return std::string("nucl:") + file.nucleotides + ",aa:" + file.aminoacids;
-    }
-}
diff --git a/src/commons/ScoreMatrixFile.h b/src/commons/ScoreMatrixFile.h
deleted file mode 100644
index 4b7ae1c..0000000
--- a/src/commons/ScoreMatrixFile.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef SCOREMATRIXFILE_H
-#define SCOREMATRIXFILE_H
-
-#include <string>
-
-class ScoreMatrixFile {
-public:
-    explicit ScoreMatrixFile(const char* filename);
-    explicit ScoreMatrixFile(const std::string& filename) : ScoreMatrixFile(filename.c_str()) {}
-    ScoreMatrixFile(const char* aminoacids, const char* nucleotides);
-    ScoreMatrixFile(const ScoreMatrixFile& copy) : ScoreMatrixFile(copy.aminoacids, copy.nucleotides) {}
-    ScoreMatrixFile& operator=(const ScoreMatrixFile& other);
-    ~ScoreMatrixFile();
-
-    bool operator==(const char* other) const;
-    bool operator==(const std::string& other) const;
-    bool operator==(const ScoreMatrixFile& other) const;
-
-    bool operator!=(const char* other) const {
-        return !(operator==(other));
-    }
-    bool operator!=(const std::string& other) const {
-        return !(operator==(other));
-    }
-    bool operator!=(const ScoreMatrixFile& other) const {
-        return !(operator==(other));
-    }
-
-    static std::string format(const ScoreMatrixFile &file);
-
-    char* nucleotides;
-    char* aminoacids;
-};
-
-#endif
diff --git a/src/commons/Sequence.cpp b/src/commons/Sequence.cpp
index 91de4d0..33a74e3 100644
--- a/src/commons/Sequence.cpp
+++ b/src/commons/Sequence.cpp
@@ -2,7 +2,6 @@
 #include "Debug.h"
 #include "Util.h"
 #include "simd.h"
-#include "ScoreMatrix.h"
 #include "SubstitutionMatrix.h"
 #include "Parameters.h"
 #include "MathUtil.h"
@@ -15,14 +14,14 @@
 Sequence::Sequence(size_t maxLen, int seqType, const BaseMatrix *subMat, const unsigned int kmerSize, const bool spaced, const bool aaBiasCorrection, bool shouldAddPC, const std::string& spacedKmerPattern)
  : spacedKmerPattern(spacedKmerPattern) {
     this->maxLen = maxLen;
-    this->int_sequence = new int[maxLen + 1];
-    this->int_consensus_sequence = new int[maxLen + 1];
+    this->numSequence = static_cast<unsigned char*>(malloc(maxLen + 1));
+    this->numConsensusSequence = new unsigned char[maxLen + 1];
     this->aaBiasCorrection = aaBiasCorrection;
     this->subMat = (BaseMatrix*)subMat;
     this->spaced = spaced;
     this->seqType = seqType;
     std::pair<const char *, unsigned int> spacedKmerInformation;
-    if (spacedKmerPattern.size() == 0){
+    if (spacedKmerPattern.empty()) {
         spacedKmerInformation = getSpacedPattern(spaced, kmerSize);
     } else {
         spacedKmerInformation = parseSpacedPattern(kmerSize, spaced, spacedKmerPattern);
@@ -34,7 +33,10 @@ Sequence::Sequence(size_t maxLen, int seqType, const BaseMatrix *subMat, const u
     this->aaPosInSpacedPattern = NULL;
     this->shouldAddPC = shouldAddPC;
     if(spacedPatternSize){
-        this->kmerWindow = new int[kmerSize];
+        simdKmerRegisterCnt = (kmerSize / (VECSIZE_INT*4)) + 1;
+        unsigned int simdKmerLen =  simdKmerRegisterCnt *  (VECSIZE_INT*4); // for SIMD memory alignment
+        this->kmerWindow = (unsigned char*) mem_align(ALIGN_INT, simdKmerLen * sizeof(unsigned char));
+        memset(this->kmerWindow, 0, simdKmerLen * sizeof(unsigned char));
         this->aaPosInSpacedPattern = new unsigned char[kmerSize];
         if(spacedPattern == NULL ) {
             Debug(Debug::ERROR) << "Sequence does not have a kmerSize (kmerSize= " << spacedPatternSize << ") to use nextKmer.\n";
@@ -81,10 +83,10 @@ Sequence::Sequence(size_t maxLen, int seqType, const BaseMatrix *subMat, const u
 
 Sequence::~Sequence() {
     delete[] spacedPattern;
-    delete[] int_sequence;
-    delete[] int_consensus_sequence;
+    free(numSequence);
+    delete[] numConsensusSequence;
     if (kmerWindow) {
-        delete[] kmerWindow;
+        free(kmerWindow);
     }
     if (aaPosInSpacedPattern){
         delete[] aaPosInSpacedPattern;
@@ -151,7 +153,7 @@ std::pair<const char *, unsigned int> Sequence::getSpacedPattern(bool spaced, un
             for(size_t i = 0; i < kmerSize; i++){
                 pattern[i]=1;
             }
-            return std::make_pair<const char *, unsigned int>((const char *) pattern, static_cast<unsigned int>(kmerSize));
+            return std::make_pair<const char *, unsigned int>(const_cast<const char*>(pattern), static_cast<unsigned int>(kmerSize));
 
 //            Debug(Debug::ERROR) << "Did not find spaced pattern for kmerSize: " << kmerSize << ". \n";
 //            Debug(Debug::ERROR) << "Please report this bug to the developer\n";
@@ -162,7 +164,7 @@ std::pair<const char *, unsigned int> Sequence::getSpacedPattern(bool spaced, un
     if (pair.second > 0) {
         memcpy(pattern, pair.first, pair.second * sizeof(char));
     }
-    return std::make_pair<const char *, unsigned int>(pattern, static_cast<unsigned int>(pair.second));
+    return std::make_pair<const char *, unsigned int>(const_cast<const char*>(pattern), static_cast<unsigned int>(pair.second));
 #undef CASE
 }
 
@@ -240,9 +242,11 @@ void Sequence::mapSequence(size_t id, unsigned int dbKey, std::pair<const unsign
         || Parameters::isEqualDbtype( this->seqType,Parameters::DBTYPE_NUCLEOTIDES)
         || Parameters::isEqualDbtype(this->seqType, Parameters::DBTYPE_PROFILE_STATE_SEQ)){
         this->L = data.second;
-        for (int aa = 0; aa < this->L; aa++) {
-            this->int_sequence[aa] = data.first[aa];
+        if(this->L >= static_cast<int>(maxLen)){
+            numSequence = static_cast<unsigned char *>(realloc(numSequence, this->L+1));
+            maxLen = this->L;
         }
+        memcpy(this->numSequence, data.first, this->L);
     } else {
         Debug(Debug::ERROR) << "Invalid sequence type!\n";
         EXIT(EXIT_FAILURE);
@@ -250,13 +254,13 @@ void Sequence::mapSequence(size_t id, unsigned int dbKey, std::pair<const unsign
     currItPos = -1;
 }
 
-void Sequence::mapProfileStateSequence(const char * sequence, unsigned int seqLen){
+void Sequence::mapProfileStateSequence(const char * profileStateSeq, unsigned int seqLen){
     size_t l = 0;
     size_t pos = 0;
-    unsigned char curr = sequence[pos];
+    unsigned char curr = profileStateSeq[pos];
     while (curr != '\0' && l < seqLen){
 
-        this->int_sequence[l]  = curr - 1;
+        this->numSequence[l]  = curr - 1;
 
         l++;
         if (l > maxLen){
@@ -264,28 +268,26 @@ void Sequence::mapProfileStateSequence(const char * sequence, unsigned int seqLe
             EXIT(EXIT_FAILURE);
         }
         pos++;
-        curr  = sequence[pos];
+        curr  = profileStateSeq[pos];
     }
     this->L = l;
 }
 
 
 
-void Sequence::mapProfile(const char * sequence, bool mapScores, unsigned int seqLen){
-    char * data = (char *) sequence;
+void Sequence::mapProfile(const char * profileData, bool mapScores, unsigned int seqLen){
+    char * data = (char *) profileData;
     size_t currPos = 0;
     float scoreBias = 0.0;
     // if no data exists
     {
         size_t l = 0;
         while (data[currPos] != '\0' && l < maxLen  && l < seqLen){
-            int nullCnt = 0;
             for (size_t aa_idx = 0; aa_idx < PROFILE_AA_SIZE; aa_idx++) {
                 // shift bytes back (avoids NULL byte)
 //            short value = static_cast<short>( ^ mask);
                 profile[l * PROFILE_AA_SIZE + aa_idx] = scoreUnmask(data[currPos + aa_idx]);
                 //value * 4;
-                nullCnt += (profile[l * PROFILE_AA_SIZE + aa_idx]==0.0);
             }
 
             float sumProb = 0.0;
@@ -295,17 +297,12 @@ void Sequence::mapProfile(const char * sequence, bool mapScores, unsigned int se
             if(sumProb > 0.9){
                 MathUtil::NormalizeTo1(&profile[l * PROFILE_AA_SIZE], PROFILE_AA_SIZE);
             }
-            if(nullCnt==PROFILE_AA_SIZE) {
-                for (size_t aa_idx = 0; aa_idx < PROFILE_AA_SIZE; aa_idx++) {
-                    profile[l * PROFILE_AA_SIZE + aa_idx] = 0.0;
-                }
-            }
 
             unsigned char queryLetter = data[currPos + PROFILE_AA_SIZE];
             // read query sequence
-            int_sequence[l] = queryLetter; // index 0 is the highst scoring one
+            numSequence[l] = queryLetter; // index 0 is the highst scoring one
             unsigned char consensusLetter = data[currPos + PROFILE_AA_SIZE+1];
-            int_consensus_sequence[l] = consensusLetter;
+            numConsensusSequence[l] = consensusLetter;
             unsigned short neff = data[currPos + PROFILE_AA_SIZE+2];
             neffM[l] = MathUtil::convertNeffToFloat(neff);
             l++;
@@ -374,8 +371,8 @@ void Sequence::mapProfile(const char * sequence, bool mapScores, unsigned int se
 
 
 template <int T>
-void Sequence::mapProfileState(const char * sequence, unsigned int seqLen){
-    mapProfile(sequence, false, seqLen);
+void Sequence::mapProfileState(const char * profileState, unsigned int seqLen){
+    mapProfile(profileState, false, seqLen);
 
     SubstitutionMatrixProfileStates * profileStateMat = (SubstitutionMatrixProfileStates *) subMat;
     // compute avg. amino acid probability
@@ -429,7 +426,7 @@ void Sequence::mapProfileState(const char * sequence, unsigned int seqLen){
 
             memcpy(&profile_index[l * profile_row_size], &indexArray, T * sizeof(int) );
             // create consensus sequence
-    //        int_sequence[l] = indexArray[0]; // index 0 is the highst scoring one
+    //        sequence[l] = indexArray[0]; // index 0 is the highest scoring one
         }
 
         // write alignment profile
@@ -481,16 +478,15 @@ void Sequence::nextProfileKmer() {
 void Sequence::mapSequence(const char * sequence, unsigned int dataLen){
     size_t l = 0;
     char curr = sequence[l];
+    if(dataLen >= maxLen){
+        numSequence = static_cast<unsigned char*>(realloc(numSequence, dataLen+1));
+        maxLen = dataLen;
+    }
     while (curr != '\0' && curr != '\n' && l < dataLen &&  l < maxLen){
-        int intaa = subMat->aa2int[(int)curr];
-        this->int_sequence[l] = intaa;
+        this->numSequence[l] = subMat->aa2num[static_cast<int>(curr)];
         l++;
         curr  = sequence[l];
     }
-
-    if(l > maxLen && curr != '\0' && curr != '\n' ){
-        Debug(Debug::INFO) << "Entry " << dbKey << " is longer than max seq. len " << maxLen << "\n";
-    }
     this->L = l;
 }
 
@@ -498,7 +494,7 @@ void Sequence::printPSSM(){
     printf("Query profile of sequence %d\n", dbKey);
     printf("Pos ");
     for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++) {
-        printf("%3c ", subMat->int2aa[aa]);
+        printf("%3c ", subMat->num2aa[aa]);
     }
     printf("Neff \n");
     for(int i = 0; i < this->L; i++){
@@ -515,7 +511,7 @@ void Sequence::printProfileStatePSSM(){
     printf("Query profile of sequence %d\n", dbKey);
     printf("Pos ");
     for(int aa = 0; aa < subMat->alphabetSize; aa++) {
-        printf("%3c ", subMat->int2aa[aa]);
+        printf("%3c ", subMat->num2aa[aa]);
     }
     printf("\n");
     for(int i = 0; i < this->L; i++){
@@ -533,7 +529,7 @@ void Sequence::printProfile(){
     printf("Query profile of sequence %d\n", dbKey);
     printf("Pos ");
     for(size_t aa = 0; aa < PROFILE_AA_SIZE; aa++) {
-        printf("%3c ", subMat->int2aa[aa]);
+        printf("%3c ", subMat->num2aa[aa]);
     }
     printf("\n");
     for(int i = 0; i < this->L; i++){
@@ -550,7 +546,7 @@ void Sequence::reverse() {
         short        tmpScore[PROFILE_AA_SIZE*4];
         unsigned int tmpIndex[PROFILE_AA_SIZE*4];
 
-        int i_curr = 0 * profile_row_size;
+        int i_curr = 0;
         int j_curr = (this->L - 1)  * profile_row_size;
 
         for (int i = 0; i < this->L/2; i++) {
@@ -564,13 +560,13 @@ void Sequence::reverse() {
             j_curr -= profile_row_size;
         }
     }
-    std::reverse(int_sequence, int_sequence + this->L); // reverse sequence
+    std::reverse(numSequence, numSequence + this->L); // reverse sequence
 }
 
 void Sequence::print() {
     std::cout << "Sequence ID " << this->id << "\n";
     for(int i = 0; i < this->L; i++){
-        printf("%c",subMat->int2aa[this->int_sequence[i]]);
+        printf("%c",subMat->num2aa[this->numSequence[i]]);
     }
     std::cout << std::endl;
 }
@@ -578,7 +574,7 @@ void Sequence::print() {
 void extractProfileData(const char* data, const BaseMatrix &subMat, const int offset, std::string &result) {
     size_t i = 0;
     while (data[i] != '\0'){
-        result.append(1, subMat.int2aa[(int)data[i + Sequence::PROFILE_AA_SIZE + offset]]);
+        result.append(1, subMat.num2aa[(int)data[i + Sequence::PROFILE_AA_SIZE + offset]]);
         i += Sequence::PROFILE_READIN_SIZE;
     }
 }
diff --git a/src/commons/Sequence.h b/src/commons/Sequence.h
index 676f307..ffe40c3 100644
--- a/src/commons/Sequence.h
+++ b/src/commons/Sequence.h
@@ -8,12 +8,12 @@
 #include "MathUtil.h"
 #include "BaseMatrix.h"
 #include "Parameters.h"
+#include "ScoreMatrix.h"
 
 #include <cstdint>
 #include <cstddef>
 #include <utility>
-
-struct ScoreMatrix;
+#include <simd/simd.h>
 
 const int8_t seed_4[]        = {1, 1, 1, 1};
 const int8_t spaced_seed_4[] = {1, 1, 1, 0, 1};
@@ -86,29 +86,34 @@ class Sequence {
     void mapSequence(size_t id, unsigned int dbKey, std::pair<const unsigned char *, const unsigned int> data);
 
     // map profile HMM, *data points to start position of Profile
-    void mapProfile(const char *sequence, bool mapScores,  unsigned int seqLen);
+    void mapProfile(const char *profileData, bool mapScores,  unsigned int seqLen);
 
     // mixture of library and profile prob
     template <int T>
-    void mapProfileState(const char *sequence, unsigned int seqLen);
+    void mapProfileState(const char *profileState, unsigned int seqLen);
 
     // map the profile state sequence
-    void mapProfileStateSequence(const char *sequence, unsigned int seqLen);
+    void mapProfileStateSequence(const char *profileStateSeq, unsigned int seqLen);
 
     // checks if there is still a k-mer left
     bool hasNextKmer() {
         return (((currItPos + 1) + this->spacedPatternSize) <= this->L);
     }
 
+    // true if the k-mer contains X; only valid after nextKmer() has been called
+    inline bool kmerContainsX(){
+        return kmerHasX != 0;
+    }
+
     // returns next k-mer
-    inline const int * nextKmer() {
+    inline const unsigned char * nextKmer() {
         if (hasNextKmer() == false) {
             return 0;
         }
 
         currItPos++;
-        const int *posToRead = int_sequence + currItPos;
-        int *currWindowPos = kmerWindow;
+        const unsigned char *posToRead = numSequence + currItPos;
+        unsigned char *currWindowPos = kmerWindow;
         switch (this->kmerSize){
             case 6:
                 kmerWindow[0] = posToRead[aaPosInSpacedPattern[0]];
@@ -127,6 +132,45 @@ class Sequence {
                 kmerWindow[5] = posToRead[aaPosInSpacedPattern[5]];
                 kmerWindow[6] = posToRead[aaPosInSpacedPattern[6]];
                 break;
+            case 10:
+                kmerWindow[0] = posToRead[aaPosInSpacedPattern[0]];
+                kmerWindow[1] = posToRead[aaPosInSpacedPattern[1]];
+                kmerWindow[2] = posToRead[aaPosInSpacedPattern[2]];
+                kmerWindow[3] = posToRead[aaPosInSpacedPattern[3]];
+                kmerWindow[4] = posToRead[aaPosInSpacedPattern[4]];
+                kmerWindow[5] = posToRead[aaPosInSpacedPattern[5]];
+                kmerWindow[6] = posToRead[aaPosInSpacedPattern[6]];
+                kmerWindow[7] = posToRead[aaPosInSpacedPattern[7]];
+                kmerWindow[8] = posToRead[aaPosInSpacedPattern[8]];
+                kmerWindow[9] = posToRead[aaPosInSpacedPattern[9]];
+                break;
+            case 11:
+                kmerWindow[0] = posToRead[aaPosInSpacedPattern[0]];
+                kmerWindow[1] = posToRead[aaPosInSpacedPattern[1]];
+                kmerWindow[2] = posToRead[aaPosInSpacedPattern[2]];
+                kmerWindow[3] = posToRead[aaPosInSpacedPattern[3]];
+                kmerWindow[4] = posToRead[aaPosInSpacedPattern[4]];
+                kmerWindow[5] = posToRead[aaPosInSpacedPattern[5]];
+                kmerWindow[6] = posToRead[aaPosInSpacedPattern[6]];
+                kmerWindow[7] = posToRead[aaPosInSpacedPattern[7]];
+                kmerWindow[8] = posToRead[aaPosInSpacedPattern[8]];
+                kmerWindow[9] = posToRead[aaPosInSpacedPattern[9]];
+                kmerWindow[10] = posToRead[aaPosInSpacedPattern[10]];
+                break;
+            case 12:
+                kmerWindow[0] = posToRead[aaPosInSpacedPattern[0]];
+                kmerWindow[1] = posToRead[aaPosInSpacedPattern[1]];
+                kmerWindow[2] = posToRead[aaPosInSpacedPattern[2]];
+                kmerWindow[3] = posToRead[aaPosInSpacedPattern[3]];
+                kmerWindow[4] = posToRead[aaPosInSpacedPattern[4]];
+                kmerWindow[5] = posToRead[aaPosInSpacedPattern[5]];
+                kmerWindow[6] = posToRead[aaPosInSpacedPattern[6]];
+                kmerWindow[7] = posToRead[aaPosInSpacedPattern[7]];
+                kmerWindow[8] = posToRead[aaPosInSpacedPattern[8]];
+                kmerWindow[9] = posToRead[aaPosInSpacedPattern[9]];
+                kmerWindow[10] = posToRead[aaPosInSpacedPattern[10]];
+                kmerWindow[11] = posToRead[aaPosInSpacedPattern[11]];
+                break;
             case 13:
                 kmerWindow[0] = posToRead[aaPosInSpacedPattern[0]];
                 kmerWindow[1] = posToRead[aaPosInSpacedPattern[1]];
@@ -357,7 +401,13 @@ class Sequence {
                 }
                 break;
         }
+        kmerHasX = 0;
 
+        const simd_int xChar = simdi8_set(subMat->aa2num[static_cast<int>('X')]);
+        for(size_t i = 0; i < simdKmerRegisterCnt; i++){
+            simd_int kmer = simdi_load((((simd_int *) kmerWindow) + i));
+            kmerHasX |= static_cast<unsigned int>(simdi8_movemask(simdi8_eq(kmer, xChar)));
+        }
         if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) ||
             Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_PROFILE_STATE_PROFILE)) {
             nextProfileKmer();
@@ -367,7 +417,7 @@ class Sequence {
             return kmerWindow;
         }
 
-        return (const int *)kmerWindow;
+        return (const unsigned char *)kmerWindow;
     }
 
     // resets the sequence position pointer to the start of the sequence
@@ -402,10 +452,10 @@ class Sequence {
     int L;
 
     // each amino acid coded as integer
-    int *int_sequence;
+    unsigned char *numSequence;
 
     // each consensus amino acid as integer (PROFILE ONLY)
-    int *int_consensus_sequence;
+    unsigned char *numConsensusSequence;
 
     // Contains profile information
     short           *profile_score;
@@ -421,7 +471,7 @@ class Sequence {
     static const size_t PROFILE_READIN_SIZE = 23;
     ScoreMatrix **profile_matrix;
     // Memory layout of this profile is qL * AA
-    //   Query lenght
+    //   Query length
     // A  -1  -3  -2  -1  -4  -2  -2  -3  -1  -3  -2  -2   7  -1  -2  -1  -1  -2  -5  -3
     // C  -1  -4   2   5  -3  -2   0  -3   1  -3  -2   0  -1   2   0   0  -1  -3  -4  -2
     // ...
@@ -505,8 +555,14 @@ class Sequence {
     // kmer Size
     unsigned int kmerSize;
 
+    // number of SIMD registers nextKmer() loads from kmerWindow for the X check
+    unsigned int simdKmerRegisterCnt;
+
     // sequence window will be filled by newxtKmer (needed for spaced patterns)
-    int *kmerWindow;
+    unsigned char *kmerWindow;
+
+    // set if kmer contains X
+    unsigned int kmerHasX;
 
     // stores position of residues in sequence
     unsigned char *aaPosInSpacedPattern;
diff --git a/src/commons/SubstitutionMatrix.cpp b/src/commons/SubstitutionMatrix.cpp
index d6d35ae..5ba72bd 100644
--- a/src/commons/SubstitutionMatrix.cpp
+++ b/src/commons/SubstitutionMatrix.cpp
@@ -43,7 +43,7 @@ SubstitutionMatrix::SubstitutionMatrix(const char *filename, float bitFactor, fl
         // read amino acid substitution matrix from file
         std::string fileName(parsedMatrix.first.c_str());
         matrixName = Util::base_name(fileName, "/\\");
-        if (fileName.substr(fileName.length() - 4, 4).compare(".out") != 0) {
+        if (fileName.length() < 4 || fileName.substr(fileName.length() - 4, 4).compare(".out") != 0) {
             Debug(Debug::ERROR) << "Invalid format of the substitution matrix input file! Only .out files are accepted.\n";
             EXIT(EXIT_FAILURE);
         }
@@ -67,7 +67,7 @@ SubstitutionMatrix::SubstitutionMatrix(const char *filename, float bitFactor, fl
 
     setupLetterMapping();
 
-    //print(probMatrix, int2aa, alphabetSize);
+    //print(probMatrix, num2aa, alphabetSize);
     generateSubMatrix(probMatrix, subMatrixPseudoCounts, subMatrix, alphabetSize, true, bitFactor, scoreBias);
 }
 
@@ -104,7 +104,7 @@ bool SubstitutionMatrix::estimateLambdaAndBackground(const double **scoreMatrix,
 
 
 void SubstitutionMatrix::calcLocalAaBiasCorrection(const BaseMatrix *m,
-                                                   const int *int_sequence,
+                                                   const unsigned char *int_sequence,
                                                    const int N,
                                                    float *compositionBias) {
     const int windowSize = 40;
@@ -301,45 +301,45 @@ void SubstitutionMatrix::setupLetterMapping(){
             case 'W':
             case 'Y':
             case 'X':
-                this->aa2int[static_cast<int>(letter)] = this->aa2int[static_cast<int>(upperLetter)];
+                this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>(upperLetter)];
                 break;
             case 'J':
-                this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'L'];
+                this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'L'];
                 break;
             case 'U':
             case 'O':
-                this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'X'];
+                this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'X'];
                 break;
-            case 'Z': this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'E']; break;
-            case 'B': this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'D']; break;
+            case 'Z': this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'E']; break;
+            case 'B': this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'D']; break;
             default:
-                this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'X'];
+                this->aa2num[static_cast<int>(letter)] = this->aa2num[(int)'X'];
                 break;
         }
     }
 }
 
-int SubstitutionMatrix::parseAlphabet(char *word, char *int2aa, int *aa2int) {
+int SubstitutionMatrix::parseAlphabet(char *word, char *num2aa, int *aa2num) {
     char *charReader = word;
     int minAAInt = INT_MAX;
     // find amino acid with minimal int value
     while (isalpha(*charReader)) {
         const char aa = *charReader;
-        const int intAA = aa2int[static_cast<int>(aa)];
+        const int intAA = aa2num[static_cast<int>(aa)];
         minAAInt = std::max(minAAInt, intAA);
         charReader++;
     }
     if(minAAInt==-1){
 
     }
-    char minAAChar = int2aa[minAAInt];
+    char minAAChar = num2aa[minAAInt];
     // do alphabet reduction
     charReader = word;
     while (isalpha(*charReader)) {
         const char aa = *charReader;
-        const int intAA = aa2int[static_cast<int>(aa)];
-        aa2int[static_cast<int>(aa)] = minAAInt;
-        int2aa[intAA] = minAAChar;
+        const int intAA = aa2num[static_cast<int>(aa)];
+        aa2num[static_cast<int>(aa)] = minAAInt;
+        num2aa[intAA] = minAAChar;
         charReader++;
     }
     return minAAInt;
@@ -381,7 +381,7 @@ void SubstitutionMatrix::readProbMatrix(const std::string &matrixData, const boo
                 Debug(Debug::ERROR) << "First element in probability line must be an alphabet letter.\n";
                 EXIT(EXIT_FAILURE);
             }
-            int aa = aa2int[toupper(words[0][0])];
+            int aa = static_cast<int>(aa2num[toupper(words[0][0])]);
             for (int i = 0; i < alphabetSize; i++) {
                 double f = strtod(words[i + 1], NULL);
                 probMatrix[aa][i] = f; // divided by 2 because we scale bit/2 ot bit
@@ -391,7 +391,7 @@ void SubstitutionMatrix::readProbMatrix(const std::string &matrixData, const boo
     bool xIsPositive = false;
     if( containsX == true ){
         for (int j = 0; j < alphabetSize; j++) {
-            int xIndex = aa2int[(int)'X'];
+            int xIndex = static_cast<int>(aa2num[static_cast<int>('X')]);
             if ((probMatrix[xIndex][j] > 0) || (probMatrix[j][xIndex] > 0)) {
                 xIsPositive = true;
                 break;
@@ -411,11 +411,11 @@ void SubstitutionMatrix::readProbMatrix(const std::string &matrixData, const boo
             Debug(Debug::ERROR) << "Computing inverse of substitution matrix failed\n";
             EXIT(EXIT_FAILURE);
         }
-        pBack[aa2int[(int)'X']]=ANY_BACK;
+        pBack[static_cast<int>(aa2num[static_cast<int>('X')])]=ANY_BACK;
     }
     if(xIsPositive == false){
         for (int i = 0; i < alphabetSize - 1; i++) {
-            pBack[i] = pBack[i] * (1.0 - pBack[aa2int[(int)'X']]);
+            pBack[i] = pBack[i] * (1.0 - pBack[static_cast<int>(aa2num[static_cast<int>('X')])]);
         }
     }
     // Reconstruct Probability Sab=(Pab/Pa*Pb) -> Pab = exp(Sab) * Pa * Pb
@@ -447,12 +447,12 @@ std::pair<int, bool> SubstitutionMatrix::setAaMappingDetectAlphSize(std::string
                     EXIT(EXIT_FAILURE);
                 }
                 int aa = toupper(words[i][0]);
-                aa2int[aa] = i;
-                int2aa[i] = aa;
+                aa2num[aa] = static_cast<unsigned char>(i);
+                num2aa[i] = aa;
                 if (aa == 'X') {
                     containsX = true;
                 }
-//                column_aa[i] = parseAlphabet(words[i], int2aa, aa2int);
+//                column_aa[i] = parseAlphabet(words[i], num2aa, aa2num);
             }
             alphabetSize = wordCnt;
             return std::make_pair(alphabetSize, containsX);
diff --git a/src/commons/SubstitutionMatrix.h b/src/commons/SubstitutionMatrix.h
index 3c5ddd2..103de81 100644
--- a/src/commons/SubstitutionMatrix.h
+++ b/src/commons/SubstitutionMatrix.h
@@ -23,7 +23,7 @@ class SubstitutionMatrix: public BaseMatrix {
     
         virtual double getBackgroundProb(size_t aa_index) { return pBack[aa_index]; }
 
-        static void calcLocalAaBiasCorrection(const BaseMatrix *m ,const int *int_sequence, const int N, float *compositionBias);
+        static void calcLocalAaBiasCorrection(const BaseMatrix *m ,const unsigned char *int_sequence, const int N, float *compositionBias);
         static void calcProfileProfileLocalAaBiasCorrection(short *profileScores,
                                                 const size_t profileAASize,
                                                 const int N,
@@ -61,9 +61,9 @@ class SubstitutionMatrix: public BaseMatrix {
             char * matrixData = new char[range*range];
             for(size_t i = 0; i < range; i++) {
                 matrix[i] = matrixData+(i*range);
-                int curr_i = submat.aa2int[asciiStart+i];
+                int curr_i = static_cast<int>(submat.aa2num[asciiStart+i]);
                 for (size_t j = 0; j < range; j++) {
-                    int curr_j = submat.aa2int[asciiStart+j];
+                    int curr_j = static_cast<int>(submat.aa2num[asciiStart+j]);
                     matrix[i][j] = static_cast<char>(submat.subMatrix[curr_i][curr_j]);
                 }
             }
@@ -73,7 +73,7 @@ class SubstitutionMatrix: public BaseMatrix {
         }
 
 private:
-    int parseAlphabet(char * word, char * int2aa, int * aa2int);
+    int parseAlphabet(char * word, char * num2aa, int * aa2num);
 
     void readProbMatrix(const std::string &matrixData, bool containsX);
 
diff --git a/src/commons/SubstitutionMatrixProfileStates.h b/src/commons/SubstitutionMatrixProfileStates.h
index d3adaf0..038298d 100644
--- a/src/commons/SubstitutionMatrixProfileStates.h
+++ b/src/commons/SubstitutionMatrixProfileStates.h
@@ -1,9 +1,5 @@
 #ifndef SubstitutionMatrixProfileStates_H
 #define SubstitutionMatrixProfileStates_H
-#include <fstream>
-#include <string>
-#include <vector>
-
 #include "BaseMatrix.h"
 #include "Debug.h"
 #include "ProfileStates.h"
@@ -15,43 +11,43 @@ class SubstitutionMatrixProfileStates : public BaseMatrix {
                                     float **rMatrix, float bitFactor, float scoreBias,
                                     int libAlphabetSize) {
 //        alphabetSize = 32;
-        int2aa[0] = 'A';
-        int2aa[1] = 'C';
-        int2aa[2] = 'D';
-        int2aa[3] = 'E';
-        int2aa[4] = 'F';
-        int2aa[5] = 'G';
-        int2aa[6] = 'H';
-        int2aa[7] = 'I';
-        int2aa[8] = 'K';
-        int2aa[9] = 'L';
-        int2aa[10] = 'M';
-        int2aa[11] = 'N';
-        int2aa[12] = 'P';
-        int2aa[13] = 'Q';
-        int2aa[14] = 'R';
-        int2aa[15] = 'S';
-        int2aa[16] = 'T';
-        int2aa[17] = 'V';
-        int2aa[18] = 'W';
-        int2aa[19] = 'Y';
-        int2aa[20] = 'X';
-        int2aa[21] = 'Z';
-        int2aa[22] = '[';
-        int2aa[23] = '\\';
-        int2aa[24] = ']';
-        int2aa[25] = '^';
-        int2aa[26] = '_';
-        int2aa[27] = '`';
-        int2aa[28] = 'a';
-        int2aa[29] = 'b';
-        int2aa[30] = 'c';
-        int2aa[31] = 'd';
+        num2aa[0] = 'A';
+        num2aa[1] = 'C';
+        num2aa[2] = 'D';
+        num2aa[3] = 'E';
+        num2aa[4] = 'F';
+        num2aa[5] = 'G';
+        num2aa[6] = 'H';
+        num2aa[7] = 'I';
+        num2aa[8] = 'K';
+        num2aa[9] = 'L';
+        num2aa[10] = 'M';
+        num2aa[11] = 'N';
+        num2aa[12] = 'P';
+        num2aa[13] = 'Q';
+        num2aa[14] = 'R';
+        num2aa[15] = 'S';
+        num2aa[16] = 'T';
+        num2aa[17] = 'V';
+        num2aa[18] = 'W';
+        num2aa[19] = 'Y';
+        num2aa[20] = 'X';
+        num2aa[21] = 'Z';
+        num2aa[22] = '[';
+        num2aa[23] = '\\';
+        num2aa[24] = ']';
+        num2aa[25] = '^';
+        num2aa[26] = '_';
+        num2aa[27] = '`';
+        num2aa[28] = 'a';
+        num2aa[29] = 'b';
+        num2aa[30] = 'c';
+        num2aa[31] = 'd';
         alphabetSize = 21;
         initMatrixMemory(alphabetSize);
 
         for (int i = 0; i < alphabetSize; ++i){
-            aa2int[(int)int2aa[i]] = i;
+            aa2num[static_cast<int>(num2aa[i])] = static_cast<unsigned char>(i);
         }
 
         this->matrixName = matrixName;
@@ -76,7 +72,7 @@ class SubstitutionMatrixProfileStates : public BaseMatrix {
         ps = new ProfileStates(libAlphabetSize,this->pBack);
         this->scoreNormalization = ps->getScoreNormalization();
         this->bitFactor = bitFactor * scoreNormalization;
-        //this->int2aa[toIndex] = this->int2aa[fromIndex];
+        //this->num2aa[toIndex] = this->num2aa[fromIndex];
 
         this->subMatrix = new short*[alphabetSize];
         for (int i = 0; i<alphabetSize; i++) {
diff --git a/src/commons/Util.cpp b/src/commons/Util.cpp
index 8000fab..d7d5229 100644
--- a/src/commons/Util.cpp
+++ b/src/commons/Util.cpp
@@ -274,6 +274,37 @@ void Util::parseKey(const char *data, char *key) {
     key[keySize] = '\0';
 }
 
+// Formats seqId as fixed-point text with three decimals ("1.000", "0.873",
+// "0.050") into buffer and NUL-terminates it. Digits beyond the third decimal
+// are truncated by the int cast, not rounded; values outside [0, 1] are not
+// handled specially. Returns the position one past the NUL in both branches
+// (the value Itoa::i32toa_sse2 hands back), so a caller may overwrite the
+// terminator with a separator via *(ret - 1).
+char* Util::fastSeqIdToBuffer(float seqId, char* buffer) {
+    if (seqId == 1.0) {
+        // Exact identity: emit the literal, then step past the terminator so the
+        // return value follows the same convention as the Itoa path below.
+        *(buffer++) = '1';
+        *(buffer++) = '.';
+        *(buffer++) = '0';
+        *(buffer++) = '0';
+        *(buffer++) = '0';
+        *(buffer++) = '\0';
+        return buffer;
+    }
+    *(buffer++) = '0';
+    *(buffer++) = '.';
+    // The integer printer emits no leading zeros, so pad the tenths/hundredths.
+    if (seqId < 0.10) {
+        *(buffer++) = '0';
+    }
+    if (seqId < 0.01) {
+        *(buffer++) = '0';
+    }
+    // Truncate to thousandths; Itoa writes the digits plus the NUL terminator.
+    return Itoa::i32toa_sse2((int)(seqId * 1000), buffer);
+}
+
 std::vector<std::string> Util::split(const std::string &str, const std::string &sep) {
     std::vector<std::string> arr;
 
@@ -505,8 +536,7 @@ bool Util::canBeCovered(const float covThr, const int covMode, float queryLength
         case Parameters::COV_MODE_QUERY:
             return ((targetLength / queryLength) >= covThr);
         case Parameters::COV_MODE_TARGET:
-            // No assumptions possible without the alignment length
-            return true;
+            return ((queryLength/targetLength) >= covThr) ;
         case Parameters::COV_MODE_LENGTH_QUERY:
             return ((targetLength / queryLength) >= covThr) && (targetLength / queryLength) <= 1.0;
         case Parameters::COV_MODE_LENGTH_TARGET:
@@ -573,7 +603,7 @@ uint64_t Util::revComplement(const uint64_t kmer, const int k) {
     // create lookup (set 16 bytes in 128 bit)
     // a lookup entry at the index of two nucleotides (4 bit) describes the reverse
     // complement of these two nucleotide in the higher 4 bits (lookup1) or in the lower 4 bits (lookup2)
-#define c (char)
+#define c (signed char)
     __m128i lookup1 = _mm_set_epi8(c(0x50),c(0x10),c(0xD0),c(0x90),c(0x40),c(0x00),c(0xC0),c(0x80),
                                    c(0x70),c(0x30),c(0xF0),c(0xB0),c(0x60),c(0x20),c(0xE0),c(0xA0));
     __m128i lookup2 = _mm_set_epi8(c(0x05),c(0x01),c(0x0D),c(0x09),c(0x04),c(0x00),c(0x0C),c(0x08),
@@ -589,30 +619,14 @@ uint64_t Util::revComplement(const uint64_t kmer, const int k) {
 #undef c
 
     // use _mm_shuffle_epi8 to look up reverse complement
-#ifdef NEON
-    kmer1 = vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(lookup1),vreinterpretq_u8_m128i(kmer1)));
-#else
-    kmer1 =_mm_shuffle_epi8(lookup1, kmer1);
-#endif
-
-
-#ifdef NEON
-    kmer2 = vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(lookup2),vreinterpretq_u8_m128i(kmer2)));
-#else
+    kmer1 = _mm_shuffle_epi8(lookup1, kmer1);
     kmer2 = _mm_shuffle_epi8(lookup2, kmer2);
-#endif
-
 
     // _mm_or_si128: bitwise OR
     x = _mm_or_si128(kmer1, kmer2);
 
     // set upper 8 bytes to 0 and revert order of lower 8 bytes
-
-#ifdef NEON
-    x = vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(x),vreinterpretq_u8_m128i(upper)));
-#else
     x = _mm_shuffle_epi8(x, upper);
-#endif
 
     // shift out the unused nucleotide positions (1 <= k <=32 )
     // broadcast 128 bit to 64 bit
diff --git a/src/commons/Util.h b/src/commons/Util.h
index e4c51a7..3839cd2 100644
--- a/src/commons/Util.h
+++ b/src/commons/Util.h
@@ -109,6 +109,8 @@ class Util {
         return sign * val;
     }
 
+    static char* fastSeqIdToBuffer(float seqId, char* buffer);
+
     static bool isNumber(const std::string& s)
     {
         std::string::const_iterator it = s.begin();
diff --git a/src/commons/itoa.h b/src/commons/itoa.h
index 34c3c41..a79c316 100644
--- a/src/commons/itoa.h
+++ b/src/commons/itoa.h
@@ -22,14 +22,36 @@ THE SOFTWARE.
  */
 // SSE2 implementation according to http://0x80.pl/articles/sse-itoa.html
 // Modifications: (1) fix incorrect digits (2) accept all ranges (3) write to user provided buffer.
+#include <cstdint>
+
+#if defined(WASM) || defined(__ALTIVEC__)
+#include <cstdio>
+// Portable stand-in for the SSE2 Itoa below, used on WASM and AltiVec builds.
+// Every helper writes the decimal text of value plus a NUL into buffer and
+// returns the position one past that NUL (sprintf's count excludes the NUL).
+class Itoa{
+public:
+    // "%u", not "%d": values above INT32_MAX must not be printed as negative.
+    static char* u32toa_sse2(uint32_t value, char* buffer) { return buffer + sprintf(buffer, "%u", static_cast<unsigned int>(value)) + 1; }
+    static char* i32toa_sse2(int32_t value, char* buffer) { return buffer + sprintf(buffer, "%d", static_cast<int>(value)) + 1; }
+    // "%zu" expects size_t, which is narrower than uint64_t on 32-bit targets
+    // (e.g. wasm32); go through (unsigned) long long, which is at least 64 bit.
+    static char* u64toa_sse2(uint64_t value, char* buffer) { return buffer + sprintf(buffer, "%llu", static_cast<unsigned long long>(value)) + 1; }
+    // Parameter stays uint64_t for source compatibility, but the text is signed
+    // so that negative inputs print with a leading '-'.
+    static char* i64toa_sse2(uint64_t value, char* buffer) { return buffer + sprintf(buffer, "%lld", static_cast<long long>(static_cast<int64_t>(value))) + 1; }
+};
+#else
 #ifdef NEON
 #include "sse2neon.h"
 #else
+#ifdef __ALTIVEC__
+#include "sse2altivec.h"
+#else
 #include <emmintrin.h>
 #endif
+#endif
 
-#include <stdint.h>
-#include "itoa.h"
 
 #define ALIGN_PRE
 #define ALIGN_SUF  __attribute__ ((aligned(16)))
@@ -64,6 +86,10 @@ ALIGN_PRE static const uint16_t kShiftPowersVector[8] ALIGN_SUF = {
 ALIGN_PRE static const uint16_t k10Vector[8] ALIGN_SUF = { 10, 10, 10, 10, 10, 10, 10, 10 };
 ALIGN_PRE static const char kAsciiZero[16] ALIGN_SUF = { '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0' };
 
+// FIXME: NEON throws many warnings due to the reinterpret casts
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+
 class Itoa{
 public:
     static inline __m128i Convert8DigitsSSE2(uint32_t value) {
@@ -302,8 +328,11 @@ class Itoa{
     }
 
 };
+
+#pragma GCC diagnostic pop
+
 #undef ALIGN_PRE
 #undef ALIGN_SUF
 
-
+#endif
 #endif
diff --git a/src/linclust/KmerIndex.h b/src/linclust/KmerIndex.h
index 8636a54..5f73cbe 100644
--- a/src/linclust/KmerIndex.h
+++ b/src/linclust/KmerIndex.h
@@ -5,7 +5,6 @@
 // storage for k-mers
 #include "MathUtil.h"
 #include <string>
-#include <iostream>
 #include <algorithm>
 #include <fcntl.h>
 #include <sys/mman.h>
@@ -149,7 +148,7 @@ class KmerIndex{
         }
         entries[writingPosition].id = id;
         entries[writingPosition].kmerOffset = kmer - kmerStartRange;
-        entries[writingPosition].kmerOffset = (isReverse) ? BIT_SET(15, entries[writingPosition].kmerOffset) :  entries[writingPosition].kmerOffset;
+        entries[writingPosition].kmerOffset = (isReverse) ? BIT_SET(entries[writingPosition].kmerOffset, 15) :  entries[writingPosition].kmerOffset;
         entries[writingPosition].pos = pos;
         entries[writingPosition].seqLen = seqLen;
         writingPosition++;
@@ -218,9 +217,9 @@ class KmerIndex{
             if(TYPE==Parameters::DBTYPE_NUCLEOTIDES){
                 kmerIdx = BIT_CLEAR(kmerIdx, 15);
                 Indexer::printKmer(kmerIdx, kmerSize);
-//                indexer.printKmer(kmer.kmer, kmerSize, mat->int2aa);
+//                indexer.printKmer(kmer.kmer, kmerSize, mat->num2aa);
             }else{
-                indexer.printKmer(kmerIdx, kmerSize, mat->int2aa);
+                indexer.printKmer(kmerIdx, kmerSize, mat->num2aa);
             }
             Debug(Debug::INFO) << "\t";
             Debug(Debug::INFO) << kmerIdx << "\t";
diff --git a/src/linclust/LinsearchIndexReader.cpp b/src/linclust/LinsearchIndexReader.cpp
index a3eaf9c..0ade068 100644
--- a/src/linclust/LinsearchIndexReader.cpp
+++ b/src/linclust/LinsearchIndexReader.cpp
@@ -255,7 +255,7 @@ std::string LinsearchIndexReader::findIncompatibleParameter(DBReader<unsigned in
         return "maxSeqLen";
     if (meta.seqType != dbtype)
         return "seqType";
-    if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES) == false && meta.alphabetSize != par.alphabetSize)
+    if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES) == false && meta.alphabetSize != par.alphabetSize.aminoacids)
         return "alphabetSize";
     if (meta.kmerSize != par.kmerSize)
         return "kmerSize";
diff --git a/src/linclust/MarkovKmerScore.h b/src/linclust/MarkovKmerScore.h
index 5494206..63c6c5b 100644
--- a/src/linclust/MarkovKmerScore.h
+++ b/src/linclust/MarkovKmerScore.h
@@ -71,7 +71,7 @@ class MarkovKmerScore{
 
 
 
-    static float scoreKmer(const int * kmer, int kmerSize){
+    static float scoreKmer(const unsigned char * kmer, unsigned char kmerSize){
         float totalSocore = 0.0;
         for(int pos = 0; pos < kmerSize - MarkovScores::MARKOV_ORDER; pos++){
             size_t lookupIdx = Indexer::computeKmerIdx(&kmer[pos], MarkovScores::MARKOV_ORDER+1);
@@ -80,7 +80,7 @@ class MarkovKmerScore{
         return totalSocore;
     }
 
-    static int adjustedLength(const int * kmer, int kmerSize, float minScoreThr){
+    static int adjustedLength(const unsigned char * kmer, unsigned char kmerSize, float minScoreThr){
         float totalSocore = 0.0;
         int pos = 0;
         while(totalSocore < minScoreThr && pos < kmerSize - MarkovScores::MARKOV_ORDER){
diff --git a/src/linclust/kmerindexdb.cpp b/src/linclust/kmerindexdb.cpp
index 5165e0d..d11030d 100644
--- a/src/linclust/kmerindexdb.cpp
+++ b/src/linclust/kmerindexdb.cpp
@@ -9,6 +9,10 @@
 #include "KmerIndex.h"
 #include "kmersearch.h"
 
+#ifndef SIZE_T_MAX
+#define SIZE_T_MAX ((size_t) -1)
+#endif
+
 extern const char* version;
 
 int kmerindexdb(int argc, const char **argv, const Command &command) {
@@ -57,17 +61,15 @@ int kmerindexdb(int argc, const char **argv, const Command &command) {
     if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
         subMat = new NucleotideMatrix(par.seedScoringMatrixFile.nucleotides, 1.0, 0.0);
     }else {
-        if (par.alphabetSize == 21) {
+        if (par.alphabetSize.aminoacids == 21) {
             subMat = new SubstitutionMatrix(par.seedScoringMatrixFile.aminoacids, 2.0, 0.0);
         } else {
             SubstitutionMatrix sMat(par.seedScoringMatrixFile.aminoacids, 2.0, 0.0);
-            subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2int, sMat.int2aa, sMat.alphabetSize, par.alphabetSize, 2.0);
+            subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2num, sMat.num2aa, sMat.alphabetSize, par.alphabetSize.aminoacids, 2.0);
         }
     }
 
     //seqDbr.readMmapedDataInMemory();
-    const size_t KMER_SIZE = par.kmerSize;
-    size_t chooseTopKmer = par.kmersPerSequence;
 
     // memoryLimit in bytes
     size_t memoryLimit;
@@ -77,24 +79,25 @@ int kmerindexdb(int argc, const char **argv, const Command &command) {
         memoryLimit = static_cast<size_t>(Util::getTotalSystemMemory() * 0.9);
     }
     Debug(Debug::INFO) << "\n";
-    size_t totalKmers = computeKmerCount(seqDbr, KMER_SIZE, chooseTopKmer);
+
+    float kmersPerSequenceScale = (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) ?
+                                  par.kmersPerSequenceScale.nucleotides : par.kmersPerSequenceScale.aminoacids;
+    size_t totalKmers = computeKmerCount(seqDbr, par.kmerSize, par.kmersPerSequence, kmersPerSequenceScale);
+    totalKmers *= par.pickNbest;
     size_t totalSizeNeeded = computeMemoryNeededLinearfilter<short>(totalKmers);
-    Debug(Debug::INFO) << "Estimated memory consumption " << totalSizeNeeded/1024/1024 << " MB\n";
     // compute splits
     size_t splits = static_cast<size_t>(std::ceil(static_cast<float>(totalSizeNeeded) / memoryLimit));
-//    size_t splits = 2;
-    if (splits > 1) {
-        // security buffer
-        splits += 1;
-    }
+    size_t totalKmersPerSplit = std::max(static_cast<size_t>(1024+1),
+                                         static_cast<size_t>(std::min(totalSizeNeeded, memoryLimit)/sizeof(KmerPosition<short>))+1);
+    std::vector<std::pair<size_t, size_t>> hashRanges = setupKmerSplits<short>(par, subMat, seqDbr, totalKmersPerSplit, splits);
 
-    Debug(Debug::INFO) << "Process file into " << splits << " parts\n";
+    Debug(Debug::INFO) << "Process file into " << hashRanges.size() << " parts\n";
     std::vector<std::string> splitFiles;
     KmerPosition<short> *hashSeqPair = NULL;
 
     size_t writePos = 0;
     size_t mpiRank = 0;
-    size_t adjustedKmerSize = KMER_SIZE;
+    size_t adjustedKmerSize = par.kmerSize;
 #ifdef HAVE_MPI
     splits = std::max(static_cast<size_t>(MMseqsMPI::numProc), splits);
     size_t fromSplit = 0;
@@ -116,7 +119,10 @@ int kmerindexdb(int argc, const char **argv, const Command &command) {
     for(size_t split = fromSplit; split < fromSplit+splitCount; split++) {
         std::string splitFileName = par.db2 + "_split_" +SSTR(split);
         size_t splitKmerCount = (splits > 1) ? static_cast<size_t >(static_cast<double>(totalKmers/splits) * 1.2) : totalKmers;
-        KmerSearch::ExtractKmerAndSortResult kmerRet  = KmerSearch::extractKmerAndSort(splitKmerCount, split, splits, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, 1, par.adjustKmerLength);
+        int range=MathUtil::ceilIntDivision(USHRT_MAX+1, static_cast<int>(splits));
+        size_t rangeFrom = split*range;
+        size_t rangeTo = (splits == 1) ? SIZE_T_MAX : splits*range+range;
+        KmerSearch::ExtractKmerAndSortResult kmerRet = KmerSearch::extractKmerAndSort(splitKmerCount, rangeFrom, rangeTo, seqDbr, par, subMat);
         hashSeqPair = kmerRet.kmers;
         // assign rep. sequence to same kmer members
         // The longest sequence is the first since we sorted by kmer, seq.Len and id
@@ -136,18 +142,20 @@ int kmerindexdb(int argc, const char **argv, const Command &command) {
         }
     }
 #else
-    for(size_t split = 0; split < splits; split++) {
+    for(size_t split = 0; split < hashRanges.size(); split++) {
+        Debug(Debug::INFO) << "Generate k-mers list " << split <<"\n";
+
         std::string splitFileName = par.db2 + "_split_" +SSTR(split);
-        size_t splitKmerCount = (splits > 1) ? static_cast<size_t >(static_cast<double>(totalKmers/splits) * 1.2) : totalKmers;
-        KmerSearch::ExtractKmerAndSortResult kmerRet = KmerSearch::extractKmerAndSort(splitKmerCount, split, splits, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, 1, par.adjustKmerLength);
+
+        KmerSearch::ExtractKmerAndSortResult kmerRet = KmerSearch::extractKmerAndSort(totalKmersPerSplit, hashRanges[split].first, hashRanges[split].second, seqDbr, par, subMat);
         hashSeqPair = kmerRet.kmers;
         adjustedKmerSize = std::max(adjustedKmerSize, kmerRet.adjustedKmer);
         // assign rep. sequence to same kmer members
         // The longest sequence is the first since we sorted by kmer, seq.Len and id
         if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){
-            writePos = LinsearchIndexReader::pickCenterKmer<Parameters::DBTYPE_NUCLEOTIDES>(hashSeqPair, splitKmerCount);
+            writePos = LinsearchIndexReader::pickCenterKmer<Parameters::DBTYPE_NUCLEOTIDES>(hashSeqPair, totalKmersPerSplit);
         }else{
-            writePos = LinsearchIndexReader::pickCenterKmer<Parameters::DBTYPE_AMINO_ACIDS>(hashSeqPair, splitKmerCount);
+            writePos = LinsearchIndexReader::pickCenterKmer<Parameters::DBTYPE_AMINO_ACIDS>(hashSeqPair, totalKmersPerSplit);
         }
 
         if(splits > 1){
@@ -177,7 +185,7 @@ int kmerindexdb(int argc, const char **argv, const Command &command) {
         const int seqType = seqDbr.getDbtype();
         const int srcSeqType = FileUtil::parseDbType(par.db2.c_str());
         // Reuse the compBiasCorr field to store the adjustedKmerSize, It is not needed in the linsearch
-        int metadata[] = {static_cast<int>(par.maxSeqLen), static_cast<int>(KMER_SIZE), static_cast<int>(adjustedKmerSize), subMat->alphabetSize, mask, spacedKmer, 0, seqType, srcSeqType, headers1, headers2};
+        int metadata[] = {static_cast<int>(par.maxSeqLen), static_cast<int>(par.kmerSize), static_cast<int>(adjustedKmerSize), subMat->alphabetSize, mask, spacedKmer, 0, seqType, srcSeqType, headers1, headers2};
         char *metadataptr = (char *) &metadata;
         dbw.writeData(metadataptr, sizeof(metadata), PrefilteringIndexReader::META, 0);
         dbw.alignToPageSize();
diff --git a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp
index d9a40f0..3303747 100644
--- a/src/linclust/kmermatcher.cpp
+++ b/src/linclust/kmermatcher.cpp
@@ -19,7 +19,7 @@
 #include "ExtendedSubstitutionMatrix.h"
 #include "KmerGenerator.h"
 #include "MarkovKmerScore.h"
-
+#include "xxhash.h"
 #include <limits>
 #include <string>
 #include <vector>
@@ -27,48 +27,24 @@
 #include <algorithm>
 #include <sys/mman.h>
 #include <fcntl.h>
+#include <sys/stat.h>
+
 
 #ifdef OPENMP
 #include <omp.h>
 #endif
-
 #ifndef SIZE_T_MAX
 #define SIZE_T_MAX ((size_t) -1)
 #endif
 
-#define RoL(val, numbits) (val << numbits) ^ (val >> (32 - numbits))
-
-unsigned circ_hash(const int * x, unsigned length, const unsigned rol){
-    short unsigned RAND[21] = {0x4567, 0x23c6, 0x9869, 0x4873, 0xdc51, 0x5cff, 0x944a, 0x58ec, 0x1f29, 0x7ccd, 0x58ba, 0xd7ab, 0x41f2, 0x1efb, 0xa9e3, 0xe146, 0x007c, 0x62c2, 0x0854, 0x27f8, 0x231b};
-    short unsigned h = 0x0;
-    h = h ^ RAND[x[0]];                  // XOR h and ki
-    for (unsigned int i = 1; i < length; ++i){
-        h = RoL(h, rol);
-        h ^= RAND[x[i]];                   // XOR h and ki
-    }
-    return h;
-}
-
-// Rolling hash for CRC variant: compute hash value for next key x[0:length-1] from previous hash value hash( x[-1:length-2] ) and x_first = x[-1]
-unsigned circ_hash_next(const int * x, unsigned length, int x_first, short unsigned h, const unsigned rol){
-    short unsigned RAND[21] = {0x4567, 0x23c6, 0x9869, 0x4873, 0xdc51, 0x5cff, 0x944a, 0x58ec, 0x1f29, 0x7ccd, 0x58ba, 0xd7ab, 0x41f2, 0x1efb, 0xa9e3, 0xe146, 0x007c, 0x62c2, 0x0854, 0x27f8, 0x231b};
-    h ^= RoL(RAND[x_first], (5*(length-1)) % 16); // undo INITIAL_VALUE and first letter x[0] of old key
-    h =  RoL(h, rol); // circularly permute all letters x[1:length-1] to 5 positions to left
-    h ^= RAND[x[length-1]]; // add new, last letter of new key x[1:length]
-    return h;
-}
-#undef RoL
-
-
 template <typename T>
 KmerPosition<T> *initKmerPositionMemory(size_t size) {
     KmerPosition<T> * hashSeqPair = new(std::nothrow) KmerPosition<T>[size + 1];
     Util::checkAllocation(hashSeqPair, "Can not allocate memory");
     size_t pageSize = Util::getPageSize()/sizeof(KmerPosition<T>);
-
 #pragma omp parallel
     {
-#pragma omp for schedule(dynamic, 1)
+#pragma omp for schedule(static)
         for (size_t page = 0; page < size+1; page += pageSize) {
             size_t readUntil = std::min(size+1, page + pageSize) - page;
             memset(hashSeqPair+page, 0xFF, sizeof(KmerPosition<T>)* readUntil);
@@ -77,59 +53,39 @@ KmerPosition<T> *initKmerPositionMemory(size_t size) {
     return hashSeqPair;
 }
 
+// Masks `seq` in place: maskMode == 1 runs tantan over seq.numSequence[0, seq.L) (probMatrix is
+// dereferenced, so it must be non-NULL then); maskLowerCase == 1 (amino acid / nucleotide input
+// only) overwrites positions whose character in seq.getSeqData() is lower case with maskLetter.
+void maskSequence(int maskMode, int maskLowerCase, Sequence &seq, int maskLetter, ProbabilityMatrix * probMatrix){
+    if (maskMode == 1) {
+        tantan::maskSequences((char*)seq.numSequence, (char*)(seq.numSequence + seq.L),
+                              50 /*options.maxCycleLength*/, probMatrix->probMatrixPointers,
+                              0.005 /*options.repeatProb*/, 0.05 /*options.repeatEndProb*/,
+                              0.5 /*options.repeatOffsetProbDecay*/, 0, 0,
+                              0.9 /*options.minMaskProb*/, probMatrix->hardMaskTable);
+    }
+    if(maskLowerCase == 1 && (Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS) ||
+                                      Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) {
+        const char * charSeq = seq.getSeqData();
+        for (int i = 0; i < seq.L; i++) {
+            // <cctype> functions are undefined for negative char values, hence the unsigned char cast
+            seq.numSequence[i] = (islower(static_cast<unsigned char>(charSeq[i]))) ? maskLetter : seq.numSequence[i];
+        }
+    }
+}
+
 template <int TYPE, typename T>
-std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T> * hashSeqPair, DBReader<unsigned int> &seqDbr,
-                             Parameters & par, BaseMatrix * subMat,
-                             const size_t KMER_SIZE, size_t chooseTopKmer,
-                             bool includeIdenticalKmer, size_t splits,
-                             size_t split, size_t pickNBest, bool adjustLength, float chooseTopKmerScale){
+std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T> * kmerArray, size_t kmerArraySize, DBReader<unsigned int> &seqDbr,
+                                                Parameters & par, BaseMatrix * subMat, bool hashWholeSequence,
+                                                size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution){
     size_t offset = 0;
     int querySeqType  =  seqDbr.getDbtype();
-    size_t longestKmer = KMER_SIZE;
+    size_t longestKmer = par.kmerSize;
     ProbabilityMatrix *probMatrix = NULL;
     if (par.maskMode == 1) {
         probMatrix = new ProbabilityMatrix(*subMat);
     }
 
-    struct SequencePosition{
-        short score;
-        size_t kmer;
-        unsigned int pos;
-        static bool compareByScore(const SequencePosition &first, const SequencePosition &second){
-            if(first.score < second.score)
-                return true;
-            if(second.score < first.score)
-                return false;
-            if(first.kmer < second.kmer)
-                return true;
-            if(second.kmer < first.kmer)
-                return false;
-            if(first.pos < second.pos)
-                return true;
-            if(second.pos < first.pos)
-                return false;
-            return false;
-        }
-        static bool compareByScoreReverse(const SequencePosition &first, const SequencePosition &second){
-            if(first.score < second.score)
-                return true;
-            if(second.score < first.score)
-                return false;
-
-            size_t firstKmer  = BIT_SET(first.kmer, 63);
-            size_t secondKmer = BIT_SET(second.kmer, 63);
-            if(firstKmer < secondKmer)
-                return true;
-            if(secondKmer < firstKmer)
-                return false;
-            if(first.pos < second.pos)
-                return true;
-            if(second.pos < first.pos)
-                return false;
-            return false;
-        }
-    };
-
     ScoreMatrix two;
     ScoreMatrix three;
     if (TYPE == Parameters::DBTYPE_HMM_PROFILE) {
@@ -144,24 +100,22 @@ std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T> * hashSeqPair, D
 #ifdef OPENMP
         thread_idx = static_cast<unsigned int>(omp_get_thread_num());
 #endif
-        const size_t adjustedKmerSize = (adjustLength) ? std::min(KMER_SIZE+5, static_cast<size_t >(23)) :  KMER_SIZE;
-        Sequence seq(par.maxSeqLen, querySeqType, subMat, adjustedKmerSize, false, false);
+        unsigned short * scoreDist= new unsigned short[65536];
+        unsigned int * hierarchicalScoreDist= new unsigned int[128];
+
+        const int adjustedKmerSize = (par.adjustKmerLength) ? std::min( par.kmerSize+5, 23) :   par.kmerSize;
+        Sequence seq(par.maxSeqLen, querySeqType, subMat, adjustedKmerSize, par.spacedKmer, false, true, par.spacedKmerPattern);
         KmerGenerator* generator;
         if (TYPE == Parameters::DBTYPE_HMM_PROFILE) {
-            generator = new KmerGenerator(KMER_SIZE, subMat->alphabetSize, 150);
+            generator = new KmerGenerator( par.kmerSize, subMat->alphabetSize, 150);
             generator->setDivideStrategy(&three, &two);
         }
-        Indexer idxer(subMat->alphabetSize - 1, KMER_SIZE);
-        char * charSequence = new char[par.maxSeqLen + 1];
+        Indexer idxer(subMat->alphabetSize - 1,  par.kmerSize);
         const unsigned int BUFFER_SIZE = 1024;
         size_t bufferPos = 0;
         KmerPosition<T> * threadKmerBuffer = new KmerPosition<T>[BUFFER_SIZE];
-        SequencePosition * kmers = new SequencePosition[(pickNBest * (par.maxSeqLen + 1)) + 1];
-        int highestSeq[32];
-        for(size_t i = 0; i< KMER_SIZE; i++){
-            highestSeq[i]=subMat->alphabetSize-1;
-        }
-        size_t highestPossibleIndex = idxer.int2index(highestSeq);
+        SequencePosition * kmers = (SequencePosition *) malloc((par.pickNbest * (par.maxSeqLen + 1) + 1) * sizeof(SequencePosition));
+        size_t kmersArraySize = par.maxSeqLen;
         const size_t flushSize = 100000000;
         size_t iterations = static_cast<size_t>(ceil(static_cast<double>(seqDbr.getSize()) / static_cast<double>(flushSize)));
         for (size_t i = 0; i < iterations; i++) {
@@ -171,197 +125,214 @@ std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T> * hashSeqPair, D
 #pragma omp for schedule(dynamic, 10)
             for (size_t id = start; id < (start + bucketSize); id++) {
                 progress.updateProgress();
+                memset(scoreDist, 0, sizeof(unsigned short) * 65536);
+                memset(hierarchicalScoreDist, 0, sizeof(unsigned int) * 128);
+
                 seq.mapSequence(id, seqDbr.getDbKey(id), seqDbr.getData(id, thread_idx), seqDbr.getSeqLen(id));
+
                 size_t seqHash =  SIZE_T_MAX;
-                if(includeIdenticalKmer){
-                    seqHash = highestPossibleIndex + static_cast<unsigned int>(Util::hash(seq.int_sequence, seq.L));
-                }
-                // mask using tantan
-                if (par.maskMode == 1) {
-                    for (int i = 0; i < seq.L; i++) {
-                        charSequence[i] = (char) seq.int_sequence[i];
-                    }
-                    tantan::maskSequences(charSequence,
-                                          charSequence + seq.L,
-                                          50 /*options.maxCycleLength*/,
-                                          probMatrix->probMatrixPointers,
-                                          0.005 /*options.repeatProb*/,
-                                          0.05 /*options.repeatEndProb*/,
-                                          0.5 /*options.repeatOffsetProbDecay*/,
-                                          0, 0,
-                                          0.9 /*options.minMaskProb*/, probMatrix->hardMaskTable);
-                    for (int i = 0; i < seq.L; i++) {
-                        seq.int_sequence[i] = charSequence[i];
-                    }
-                }
-                if(par.maskLowerCaseMode == 1 && (Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS) ||
-                                                  Parameters::isEqualDbtype(seq.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) {
-                    const char * charSeq = seq.getSeqData();
-                    int maskLetter = subMat->aa2int[(int)'X'];
-                    for (int i = 0; i < seq.L; i++) {
-                        seq.int_sequence[i] = (islower(charSeq[i])) ? maskLetter : seq.int_sequence[i];
-                    }
+                //TODO, how to handle this in reverse?
+                if(hashWholeSequence){
+                    seqHash = Util::hash(seq.numSequence, seq.L);
+                    seqHash = XXH64(&seqHash, sizeof(size_t), par.hashShift);
                 }
 
+                maskSequence(par.maskMode, par.maskLowerCaseMode, seq, subMat->aa2num[static_cast<int>('X')], probMatrix);
 
-                int seqKmerCount = 0;
+                size_t seqKmerCount = 0;
                 unsigned int seqId = seq.getDbKey();
-                unsigned short prevHash = 0;
-                unsigned int prevFirstRes = 0;
-                if (seq.hasNextKmer()) {
-                    const int *kmer = seq.nextKmer();
-                    prevHash = circ_hash(kmer, KMER_SIZE, par.hashShift);
-                    prevFirstRes = kmer[0];
-                }
                 while (seq.hasNextKmer()) {
-                    int *kmer = (int*) seq.nextKmer();
-                    if(TYPE != Parameters::DBTYPE_NUCLEOTIDES){
-                        prevHash = circ_hash_next(kmer, KMER_SIZE, prevFirstRes, prevHash, par.hashShift);
-                        prevFirstRes = kmer[0];
-                    }
-                    size_t xCount = 0;
-                    for (size_t kpos = 0; kpos < KMER_SIZE; kpos++) {
-                        xCount += (kmer[kpos] == subMat->aa2int[(int) 'X']);
-                    }
-                    if (xCount > 0) {
+                    unsigned char *kmer = (unsigned char*) seq.nextKmer();
+                    if(seq.kmerContainsX()){
                         continue;
                     }
                     if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){
-                        int revKmer[32];
                         NucleotideMatrix * nuclMatrix = (NucleotideMatrix*)subMat;
-                        size_t kmerLen = KMER_SIZE;
-                        int * kmerToHash = kmer;
+                        size_t kmerLen =  par.kmerSize;
                         size_t kmerIdx = Indexer::computeKmerIdx(kmer, kmerLen);
                         size_t revkmerIdx = Util::revComplement(kmerIdx, kmerLen);
+                        // skip forward and rev. identical k-mers.
+                        // We can not know how to align these afterwards
+                        if(revkmerIdx == kmerIdx){
+                            continue;
+                        }
                         bool pickReverseKmer = (revkmerIdx<kmerIdx);
                         kmerIdx = (pickReverseKmer) ? revkmerIdx : kmerIdx;
-                        if(pickReverseKmer){
-                            for(int pos = static_cast<int>(adjustedKmerSize)-1; pos > -1; pos--){
-                                revKmer[(adjustedKmerSize - 1) - pos]=nuclMatrix->reverseResidue(kmer[pos]);
+
+                        const unsigned short hash = static_cast<unsigned short>(XXH64(&kmerIdx, 8, par.hashShift));
+
+                        if(par.adjustKmerLength) {
+                            unsigned char revKmer[32];
+                            unsigned char * kmerToHash = kmer;
+                            if(pickReverseKmer){
+                                for(int pos = static_cast<int>(adjustedKmerSize)-1; pos > -1; pos--){
+                                    revKmer[(adjustedKmerSize - 1) - pos]=nuclMatrix->reverseResidue(kmer[pos]);
+                                }
+                                kmerToHash = revKmer;
                             }
-                            kmerToHash = revKmer;
-                        }
-                        prevHash = circ_hash(kmerToHash, kmerLen, par.hashShift);
-                        if(adjustLength) {
                             kmerLen = MarkovKmerScore::adjustedLength(kmerToHash, adjustedKmerSize,
-                                                                      (KMER_SIZE - MarkovScores::MARKOV_ORDER) * MarkovScores::MEDIAN_SCORE);
+                                                                      (par.kmerSize - MarkovScores::MARKOV_ORDER) * MarkovScores::MEDIAN_SCORE);
                             longestKmer = std::max(kmerLen, longestKmer);
                             kmerIdx = Indexer::computeKmerIdx(kmerToHash, kmerLen);
                         }
 
-//                        std::cout << "\t" << MarkovKmerScore::scoreKmer(kmer, KMER_SIZE) << std::endl;
-//                        float score = MarkovKmerScore::scoreKmer(kmer, KMER_SIZE);
-//                        sum += score;
-//                        std::cout << score << "\t";
-//                        std::cout << len << "\t";
-//                        size_t adjkmerIdx = Indexer::computeKmerIdx(kmer, len);
-//                        Indexer::printKmer(kmerIdx, KMER_SIZE);
-//                        std::cout << "\t";
-//                        Indexer::printKmer(adjkmerIdx, len);
-//                        std::cout << std::endl;
-                        //std::cout << len << std::endl;
-
-
                         // set signed bit for normal kmers to make the  SIZE_T_MAX logic easier
                         // reversed kmers do not have a signed bit
                         size_t kmerRev = (pickReverseKmer) ? BIT_CLEAR(kmerIdx, 63) : BIT_SET(kmerIdx, 63);
                         (kmers + seqKmerCount)->kmer = kmerRev;
                         int pos = seq.getCurrentPosition();
                         (kmers + seqKmerCount)->pos = (pickReverseKmer) ? (seq.L) - pos - kmerLen : pos;
-//                        std::cout << seq.getDbKey() << "\t";
-//                        std::cout << pickReverseKmer << "\t";
-//                        std::cout << seq.L << "\t";sta
-//                        std::cout << pos << "\t";
-//                        std::cout << (kmers + seqKmerCount)->pos << "\t";
-//                        printKmer(kmerIdx, KMER_SIZE);
-//                        std::cout <<  "\t";
-//                         printKmer(kmerRev, KMER_SIZE);
-//                        std::cout <<  "\n";
-                        (kmers + seqKmerCount)->score = prevHash;
+                        (kmers + seqKmerCount)->score = hash;
+                        scoreDist[hash]++;
+                        hierarchicalScoreDist[hash >> 9]++;
                         seqKmerCount++;
                     } else if(TYPE == Parameters::DBTYPE_HMM_PROFILE) {
                         std::pair<size_t*, size_t>  scoreMat = generator->generateKmerList(kmer, true);
 //                        std::cout << scoreMat.elementSize << std::endl;
-                        for(size_t kmerPos = 0; kmerPos < scoreMat.second && kmerPos < pickNBest; kmerPos++){
+                        for(size_t kmerPos = 0; kmerPos < scoreMat.second && kmerPos < static_cast<size_t >(par.pickNbest); kmerPos++){
                             (kmers + seqKmerCount)->kmer  =  scoreMat.first[kmerPos];
                             (kmers + seqKmerCount)->pos = seq.getCurrentPosition();
-                            //TODO
-                            prevHash = circ_hash(kmer, KMER_SIZE, par.hashShift);
-                            (kmers + seqKmerCount)->score = prevHash;
+                            const unsigned short hash = static_cast<unsigned short>(XXH64(&(kmers + seqKmerCount)->kmer, 8, par.hashShift));
+                            (kmers + seqKmerCount)->score = hash;
+                            scoreDist[hash]++;
+                            hierarchicalScoreDist[hash >> 9]++;
                             seqKmerCount++;
                         }
                     } else {
-                        uint64_t kmerIdx = idxer.int2index(kmer, 0, KMER_SIZE);
+                        uint64_t kmerIdx = idxer.int2index(kmer, 0, par.kmerSize);
                         (kmers + seqKmerCount)->kmer = kmerIdx;
                         (kmers + seqKmerCount)->pos = seq.getCurrentPosition();
-                        (kmers + seqKmerCount)->score = prevHash;
+                        const unsigned short hash = static_cast<unsigned short>(XXH64(&kmerIdx, sizeof(kmerIdx), par.hashShift));
+//                        (kmers + seqKmerCount)->score = hash;
+//                        const unsigned short hash = circ_hash(kmer, par.kmerSize, 5);
+                        (kmers + seqKmerCount)->score = hash;
+                        scoreDist[hash]++;
+                        hierarchicalScoreDist[hash >> 9]++;
+//                        std::cout << seqId << "\t" << (kmers + seqKmerCount)->score << "\t" << (kmers + seqKmerCount)->pos << std::endl;
+
                         seqKmerCount++;
                     }
+                    if(seqKmerCount >= kmersArraySize){
+                        kmersArraySize = seq.getMaxLen();
+                        kmers = (SequencePosition *) realloc(kmers, (par.pickNbest * (kmersArraySize + 1) + 1) * sizeof(SequencePosition));
+                    }
 
                 }
-
-                if (seqKmerCount > 1) {
-                    if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) {
-                        std::sort(kmers, kmers + seqKmerCount, SequencePosition::compareByScoreReverse);
-                    }else{
-                        std::sort(kmers, kmers + seqKmerCount, SequencePosition::compareByScore);
+                float kmersPerSequenceScale = (TYPE == Parameters::DBTYPE_NUCLEOTIDES) ? par.kmersPerSequenceScale.nucleotides
+                                                                                       : par.kmersPerSequenceScale.aminoacids;
+                size_t kmerConsidered = std::min(static_cast<size_t >(par.kmersPerSequence  - 1 + (kmersPerSequenceScale * seq.L)), seqKmerCount);
+
+                unsigned int threshold = 0;
+                size_t kmerInBins = 0;
+                if (seqKmerCount > 0) {
+                    size_t hierarchicaThreshold = 0;
+                    for(hierarchicaThreshold = 0; hierarchicaThreshold < 128 && kmerInBins < kmerConsidered; hierarchicaThreshold++){
+                        kmerInBins += hierarchicalScoreDist[hierarchicaThreshold];
+                    }
+                    hierarchicaThreshold -= (hierarchicaThreshold > 0) ? 1: 0;
+                    kmerInBins -= hierarchicalScoreDist[hierarchicaThreshold];
+                    for(threshold = hierarchicaThreshold*512; threshold <= USHRT_MAX && kmerInBins < kmerConsidered; threshold++){
+                        kmerInBins += scoreDist[threshold];
                     }
                 }
+                int tooMuchElemInLastBin = (kmerInBins - kmerConsidered);
 
                 // add k-mer to represent the identity
-                //TODO, how to handle this in reverse?
-                if (seqHash%splits == split) {
+                if (static_cast<unsigned short>(seqHash) >= hashStartRange && static_cast<unsigned short>(seqHash) <= hashEndRange) {
                     threadKmerBuffer[bufferPos].kmer = seqHash;
                     threadKmerBuffer[bufferPos].id = seqId;
                     threadKmerBuffer[bufferPos].pos = 0;
                     threadKmerBuffer[bufferPos].seqLen = seq.L;
+                    if(hashDistribution != NULL){
+                        __sync_fetch_and_add(&hashDistribution[static_cast<unsigned short>(seqHash)], 1);
+                    }
                     bufferPos++;
                     if (bufferPos >= BUFFER_SIZE) {
                         size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos);
-                        memcpy(hashSeqPair + writeOffset, threadKmerBuffer, sizeof(KmerPosition<T>) * bufferPos);
+                        if(writeOffset + bufferPos < kmerArraySize){
+                            if(kmerArray!=NULL){
+                                memcpy(kmerArray + writeOffset, threadKmerBuffer, sizeof(KmerPosition<T>) * bufferPos);
+                            }
+                        } else{
+                            Debug(Debug::ERROR) << "Kmer array overflow. currKmerArrayOffset="<< writeOffset
+                                                << ", kmerBufferPos=" << bufferPos
+                                                << ", kmerArraySize=" << kmerArraySize <<".\n";
+                            EXIT(EXIT_FAILURE);
+                        }
                         bufferPos = 0;
                     }
                 }
 
-                size_t kmersToConsider = std::min(static_cast<int>(chooseTopKmer - 1 + (chooseTopKmerScale * seq.L)), seqKmerCount);
-                size_t kmersConsidered = 0;
-
-                size_t prevKmer = SIZE_T_MAX;
-                kmers[seqKmerCount].kmer = SIZE_T_MAX;
-                for (size_t topKmer = 0; topKmer < static_cast<size_t>(seqKmerCount) &&
-                                         kmersConsidered < kmersToConsider; topKmer++) {
-
-                    size_t kmerCurr = (kmers + topKmer)->kmer;
-                    size_t kmerNext = (kmers + topKmer + 1)->kmer;
-
-                    if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) {
-                        kmerCurr = BIT_SET(kmerCurr, 63);
-                        kmerNext = BIT_SET(kmerNext, 63);
+                if(par.ignoreMultiKmer){
+                    if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) {
+                        std::sort(kmers, kmers + seqKmerCount, SequencePosition::compareByScoreReverse);
+                    }else{
+                        std::sort(kmers, kmers + seqKmerCount, SequencePosition::compareByScore);
+                    }
+                }
+                size_t selectedKmer = 0;
+                for (size_t kmerIdx = 0; kmerIdx < seqKmerCount && selectedKmer < kmerConsidered; kmerIdx++) {
+                    size_t kmer = (kmers + kmerIdx)->kmer;
+                    if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) {
+                        kmer = BIT_SET(kmer, 63);
                     }
-
-                    bool repeatedKmer = (kmerCurr == kmerNext || kmerCurr == prevKmer);
-                    prevKmer = kmerCurr;
-
                     /* skip repeated kmer */
-                    if (par.ignoreMultiKmer > 0 && repeatedKmer) {
-                        continue;
+                    if (par.ignoreMultiKmer) {
+                        if(kmerIdx + 1 < seqKmerCount){
+                            size_t nextKmer = (kmers + kmerIdx + 1)->kmer;
+                            if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) {
+                                nextKmer = BIT_SET(nextKmer, 63);
+                            }
+                            if(kmer == nextKmer){
+                                kmerIdx++;
+                                continue;
+                            }
+                        }
                     }
 
-                    kmersConsidered++;
-                    size_t splitIdx = kmerCurr % splits;
-                    if (splitIdx != split) {
-                        continue;
-                    }
+                    if ((kmers + kmerIdx)->score < threshold ){
+                        // this check is needed to avoid extracting too many elements from the last bin
+                        if((kmers + kmerIdx)->score == (threshold - 1) && tooMuchElemInLastBin){
+                            tooMuchElemInLastBin--;
+                            threshold -= (tooMuchElemInLastBin == 0) ? 1 : 0;
+                        }
+//                        std::cout << seqId << "\t" << (kmers + kmerIdx)->score << "\t" << (kmers + kmerIdx)->pos << std::endl;
+
+                        selectedKmer++;
+                        if ((kmers + kmerIdx)->score >= hashStartRange && (kmers + kmerIdx)->score <= hashEndRange)
+                        {
+//                            {
+//                                size_t tmpKmerIdx= (kmers + kmerIdx)->kmer;
+//                                tmpKmerIdx=BIT_CLEAR(tmpKmerIdx, 63);
+//                                std::cout << seqId << "\t" << (kmers + kmerIdx)->score << "\t" << tmpKmerIdx << std::endl;
+//                            }
+                            threadKmerBuffer[bufferPos].kmer = (kmers + kmerIdx)->kmer;
+                            threadKmerBuffer[bufferPos].id = seqId;
+                            threadKmerBuffer[bufferPos].pos = (kmers + kmerIdx)->pos;
+                            threadKmerBuffer[bufferPos].seqLen = seq.L;
+                            bufferPos++;
+                            if(hashDistribution != NULL){
+                                __sync_fetch_and_add(&hashDistribution[(kmers + kmerIdx)->score], 1);
+                            }
 
-                    threadKmerBuffer[bufferPos].kmer = (kmers + topKmer)->kmer;
-                    threadKmerBuffer[bufferPos].id = seqId;
-                    threadKmerBuffer[bufferPos].pos = (kmers + topKmer)->pos;
-                    threadKmerBuffer[bufferPos].seqLen = seq.L;
-                    bufferPos++;
-                    if (bufferPos >= BUFFER_SIZE) {
-                        size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos);
-                        memcpy(hashSeqPair + writeOffset, threadKmerBuffer, sizeof(KmerPosition<T>) * bufferPos);
-                        bufferPos = 0;
+                            if (bufferPos >= BUFFER_SIZE) {
+                                size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos);
+                                if(writeOffset + bufferPos < kmerArraySize){
+                                    if(kmerArray!=NULL) {
+                                        memcpy(kmerArray + writeOffset, threadKmerBuffer,
+                                               sizeof(KmerPosition<T>) * bufferPos);
+                                    }
+                                } else{
+                                    Debug(Debug::ERROR) << "Kmer array overflow. currKmerArrayOffset="<< writeOffset
+                                                        << ", kmerBufferPos=" << bufferPos
+                                                        << ", kmerArraySize=" << kmerArraySize <<".\n";
+
+                                    EXIT(EXIT_FAILURE);
+                                }
+
+                                bufferPos = 0;
+                            }
+                        }
                     }
                 }
             }
@@ -376,13 +347,17 @@ std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T> * hashSeqPair, D
 #pragma omp barrier
         }
 
+
         if(bufferPos > 0){
             size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos);
-            memcpy(hashSeqPair+writeOffset, threadKmerBuffer, sizeof(KmerPosition<T>) * bufferPos);
+            if(kmerArray != NULL){
+                memcpy(kmerArray+writeOffset, threadKmerBuffer, sizeof(KmerPosition<T>) * bufferPos);
+            }
         }
-        delete[] kmers;
-        delete[] charSequence;
+        free(kmers);
         delete[] threadKmerBuffer;
+        delete[] hierarchicalScoreDist;
+        delete[] scoreDist;
         if (TYPE == Parameters::DBTYPE_HMM_PROFILE) {
             delete generator;
         }
@@ -400,36 +375,21 @@ std::pair<size_t, size_t> fillKmerPositionArray(KmerPosition<T> * hashSeqPair, D
 }
 
 template <typename T>
-KmerPosition<T> * doComputation(size_t totalKmers, size_t split, size_t splits, std::string splitFile,
-                             DBReader<unsigned int> & seqDbr, Parameters & par, BaseMatrix  * subMat,
-                             size_t KMER_SIZE, size_t chooseTopKmer, bool adjustLength, float chooseTopKmerScale) {
+KmerPosition<T> * doComputation(size_t totalKmers, size_t hashStartRange, size_t hashEndRange, std::string splitFile,
+                                DBReader<unsigned int> & seqDbr, Parameters & par, BaseMatrix  * subMat) {
 
-    Debug(Debug::INFO) << "Generate k-mers list for " << (split+1) <<" split\n";
-
-    size_t splitKmerCount = totalKmers;
-    if(splits > 1){
-        size_t memoryLimit;
-        if (par.splitMemoryLimit > 0) {
-            memoryLimit = static_cast<size_t>(par.splitMemoryLimit);
-        } else {
-            memoryLimit = static_cast<size_t>(Util::getTotalSystemMemory() * 0.9);
-        }
-        // we do not really know how much memory is needed. So this is our best choice
-        splitKmerCount = (memoryLimit / sizeof(KmerPosition<T>));
-    }
-
-    KmerPosition<T> * hashSeqPair = initKmerPositionMemory<T>(splitKmerCount);
+    KmerPosition<T> * hashSeqPair = initKmerPositionMemory<T>(totalKmers);
     size_t elementsToSort;
     if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){
-        std::pair<size_t, size_t > ret = fillKmerPositionArray<Parameters::DBTYPE_NUCLEOTIDES, T>(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, true, splits, split, 1, adjustLength, chooseTopKmerScale);
+        std::pair<size_t, size_t > ret = fillKmerPositionArray<Parameters::DBTYPE_NUCLEOTIDES, T>(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL);
         elementsToSort = ret.first;
-        KMER_SIZE = ret.second;
-        Debug(Debug::INFO) << "\nAdjusted k-mer length " << KMER_SIZE << "\n";
+        par.kmerSize = ret.second;
+        Debug(Debug::INFO) << "\nAdjusted k-mer length " << par.kmerSize << "\n";
     }else{
-        std::pair<size_t, size_t > ret = fillKmerPositionArray<Parameters::DBTYPE_AMINO_ACIDS, T>(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, true, splits, split, 1, false, chooseTopKmerScale);
+        std::pair<size_t, size_t > ret = fillKmerPositionArray<Parameters::DBTYPE_AMINO_ACIDS, T>(hashSeqPair, totalKmers, seqDbr, par, subMat, true, hashStartRange, hashEndRange, NULL);
         elementsToSort = ret.first;
     }
-    if(splits == 1){
+    if(hashEndRange == SIZE_T_MAX){
         seqDbr.unmapData();
     }
 
@@ -442,22 +402,13 @@ KmerPosition<T> * doComputation(size_t totalKmers, size_t split, size_t splits,
     }
     Debug(Debug::INFO) << timer.lap() << "\n";
 
-//    {
-//        Indexer indexer(subMat->alphabetSize, KMER_SIZE);
-//        for(size_t i = 0; i < elementsToSort; i++){
-//            indexer.printKmer(hashSeqPair[i].kmer, KMER_SIZE, subMat->int2aa );
-//            Debug(Debug::INFO) << "\t" << hashSeqPair[i].kmer<< "\n";
-//        }
-//    }
-    //kx::radix_sort(hashSeqPair, hashSeqPair + elementsToSort, KmerComparision());
-
     // assign rep. sequence to same kmer members
     // The longest sequence is the first since we sorted by kmer, seq.Len and id
     size_t writePos;
     if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){
-        writePos = assignGroup<Parameters::DBTYPE_NUCLEOTIDES, T>(hashSeqPair, splitKmerCount, par.includeOnlyExtendable, par.covMode, par.covThr);
+        writePos = assignGroup<Parameters::DBTYPE_NUCLEOTIDES, T>(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr);
     }else{
-        writePos = assignGroup<Parameters::DBTYPE_AMINO_ACIDS, T>(hashSeqPair, splitKmerCount, par.includeOnlyExtendable, par.covMode, par.covThr);
+        writePos = assignGroup<Parameters::DBTYPE_AMINO_ACIDS, T>(hashSeqPair, totalKmers, par.includeOnlyExtendable, par.covMode, par.covThr);
     }
 
     // sort by rep. sequence (stored in kmer) and sequence id
@@ -474,7 +425,7 @@ KmerPosition<T> * doComputation(size_t totalKmers, size_t split, size_t splits,
 //    }
     Debug(Debug::INFO) << timer.lap() << "\n";
 
-    if(splits > 1){
+    if(hashEndRange != SIZE_T_MAX){
         if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){
             writeKmersToDisk<Parameters::DBTYPE_NUCLEOTIDES, KmerEntryRev, T>(splitFile, hashSeqPair, writePos + 1);
         }else{
@@ -603,11 +554,11 @@ template size_t assignGroup<1, short>(KmerPosition<short> *kmers, size_t splitKm
 template size_t assignGroup<1, int>(KmerPosition<int> *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr);
 
 void setLinearFilterDefault(Parameters *p) {
-    p->spacedKmer = false;
     p->covThr = 0.8;
     p->maskMode = 0;
+    p->spacedKmer = 0;
     p->kmerSize = Parameters::CLUST_LINEAR_DEFAULT_K;
-    p->alphabetSize = Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE;
+    p->alphabetSize = MultiParam<int>(Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE, 5); // NOTE(review): presumably (aminoacids, nucleotides); 5 matches alphabetSize.nucleotides in setKmerLengthAndAlphabet - confirm argument order
     p->kmersPerSequence = Parameters::CLUST_LINEAR_KMER_PER_SEQ;
 }
 
@@ -617,16 +568,18 @@ size_t computeKmerCount(DBReader<unsigned int> &reader, size_t KMER_SIZE, size_t
     for(size_t id = 0; id < reader.getSize(); id++ ){
         int seqLen = static_cast<int>(reader.getSeqLen(id));
         // we need one for the sequence hash
-        int kmerAdjustedSeqLen = std::max(1, seqLen  - static_cast<int>(KMER_SIZE ) + 1) ;
+        int kmerAdjustedSeqLen = std::max(1, seqLen  - static_cast<int>(KMER_SIZE ) + 2) ;
         totalKmers += std::min(kmerAdjustedSeqLen, static_cast<int>( chooseTopKmer + (chooseTopKmerScale * seqLen)));
     }
     return totalKmers;
 }
+
 template <typename T>
 size_t computeMemoryNeededLinearfilter(size_t totalKmer) {
     return sizeof(KmerPosition<T>) * totalKmer;
 }
 
+
 template <typename T>
 int kmermatcherInner(Parameters& par, DBReader<unsigned int>& seqDbr) {
 
@@ -635,18 +588,15 @@ int kmermatcherInner(Parameters& par, DBReader<unsigned int>& seqDbr) {
     if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
         subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, 0.0);
     }else {
-        if (par.alphabetSize == 21) {
+        if (par.alphabetSize.aminoacids == 21) {
             subMat = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
         } else {
             SubstitutionMatrix sMat(par.scoringMatrixFile.aminoacids, 8.0, -0.2f);
-            subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2int, sMat.int2aa, sMat.alphabetSize, par.alphabetSize, 2.0);
+            subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2num, sMat.num2aa, sMat.alphabetSize, par.alphabetSize.aminoacids, 2.0);
         }
     }
 
     //seqDbr.readMmapedDataInMemory();
-    const size_t KMER_SIZE = par.kmerSize;
-    size_t chooseTopKmer = par.kmersPerSequence;
-    float chooseTopKmerScale = par.kmersPerSequenceScale;
 
     // memoryLimit in bytes
     size_t memoryLimit;
@@ -656,18 +606,18 @@ int kmermatcherInner(Parameters& par, DBReader<unsigned int>& seqDbr) {
         memoryLimit = static_cast<size_t>(Util::getTotalSystemMemory() * 0.9);
     }
     Debug(Debug::INFO) << "\n";
-    size_t totalKmers = computeKmerCount(seqDbr, KMER_SIZE, chooseTopKmer, chooseTopKmerScale);
+    float kmersPerSequenceScale = (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) ?
+                                        par.kmersPerSequenceScale.nucleotides : par.kmersPerSequenceScale.aminoacids;
+    size_t totalKmers = computeKmerCount(seqDbr, par.kmerSize, par.kmersPerSequence, kmersPerSequenceScale);
     size_t totalSizeNeeded = computeMemoryNeededLinearfilter<T>(totalKmers);
-    Debug(Debug::INFO) << "Estimated memory consumption " << totalSizeNeeded/1024/1024 << " MB\n";
     // compute splits
     size_t splits = static_cast<size_t>(std::ceil(static_cast<float>(totalSizeNeeded) / memoryLimit));
-//    size_t splits = 2;
-    if (splits > 1) {
-        // security buffer
-        splits += 1;
-    }
+    size_t totalKmersPerSplit = std::max(static_cast<size_t>(1024+1),
+                                         static_cast<size_t>(std::min(totalSizeNeeded, memoryLimit)/sizeof(KmerPosition<T>))+1);
+
+    std::vector<std::pair<size_t, size_t>> hashRanges = setupKmerSplits<T>(par, subMat, seqDbr, totalKmersPerSplit, splits);
     if(splits > 1){
-        Debug(Debug::INFO) << "Process file into " << splits << " parts\n";
+        Debug(Debug::INFO) << "Process file into " << hashRanges.size() << " parts\n";
     }
     std::vector<std::string> splitFiles;
     KmerPosition<T> *hashSeqPair = NULL;
@@ -693,7 +643,10 @@ int kmermatcherInner(Parameters& par, DBReader<unsigned int>& seqDbr) {
 
     for(size_t split = fromSplit; split < fromSplit+splitCount; split++) {
         std::string splitFileName = par.db2 + "_split_" +SSTR(split);
-        hashSeqPair = doComputation<T>(totalKmers, split, splits, splitFileName, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, par.adjustKmerLength, chooseTopKmerScale);
+        int range=MathUtil::ceilIntDivision(USHRT_MAX+1, static_cast<int>(splits));
+        size_t rangeFrom = split*range;
+        size_t rangeTo = (splits == 1) ? SIZE_T_MAX : splits*range+range; // NOTE(review): upper bound is built from `splits`, not `split`, so it is the same for every iteration - confirm whether `split*range+range` was intended
+        hashSeqPair = doComputation<T>(totalKmers, rangeFrom, rangeTo, splitFileName, seqDbr, par, subMat); // NOTE(review): passes totalKmers and ignores hashRanges, unlike the non-MPI loop (totalKmersPerSplit, hashRanges[split]) - confirm
     }
     MPI_Barrier(MPI_COMM_WORLD);
     if(mpiRank == 0){
@@ -703,12 +656,13 @@ int kmermatcherInner(Parameters& par, DBReader<unsigned int>& seqDbr) {
         }
     }
 #else
-    for(size_t split = 0; split < splits; split++) {
+    for(size_t split = 0; split < hashRanges.size(); split++) {
         std::string splitFileName = par.db2 + "_split_" +SSTR(split);
+        Debug(Debug::INFO) << "Generate k-mers list for " << (split+1) <<" split\n";
 
         std::string splitFileNameDone = splitFileName + ".done";
         if(FileUtil::fileExists(splitFileNameDone.c_str()) == false){
-            hashSeqPair = doComputation<T>(totalKmers, split, splits, splitFileName, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, par.adjustKmerLength, chooseTopKmerScale);
+            hashSeqPair = doComputation<T>(totalKmersPerSplit, hashRanges[split].first, hashRanges[split].second, splitFileName, seqDbr, par, subMat);
         }
 
         splitFiles.push_back(splitFileName);
@@ -737,9 +691,9 @@ int kmermatcherInner(Parameters& par, DBReader<unsigned int>& seqDbr) {
             }
         } else {
             if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) {
-                writeKmerMatcherResult<Parameters::DBTYPE_NUCLEOTIDES>(dbw, hashSeqPair, totalKmers, repSequence, 1);
+                writeKmerMatcherResult<Parameters::DBTYPE_NUCLEOTIDES>(dbw, hashSeqPair, totalKmersPerSplit, repSequence, 1);
             }else{
-                writeKmerMatcherResult<Parameters::DBTYPE_AMINO_ACIDS>(dbw, hashSeqPair, totalKmers, repSequence, 1);
+                writeKmerMatcherResult<Parameters::DBTYPE_AMINO_ACIDS>(dbw, hashSeqPair, totalKmersPerSplit, repSequence, 1);
             }
         }
         Debug(Debug::INFO) << "Time for fill: " << timer.lap() << "\n";
@@ -777,6 +731,50 @@ int kmermatcherInner(Parameters& par, DBReader<unsigned int>& seqDbr) {
     return EXIT_SUCCESS;
 }
 
+// Partition the hash values 0..USHRT_MAX into contiguous (start, end) ranges so that no range
+// holds more than totalKmers k-mers; for splits <= 1 the single range (0, SIZE_T_MAX) is returned.
+template <typename T>
+std::vector<std::pair<size_t, size_t>> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader<unsigned int> &seqDbr, size_t totalKmers, size_t splits){
+    std::vector<std::pair<size_t, size_t>> hashRanges;
+    if (splits > 1) {
+        Debug(Debug::INFO) << "Not enough memory to process at once need to split\n";
+        // compute exact k-mer dist: the k-mer array is NULL, so k-mers are only counted into hashDist
+        size_t * hashDist = new size_t[USHRT_MAX+1]();
+        if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){
+            fillKmerPositionArray<Parameters::DBTYPE_NUCLEOTIDES, T>(NULL, SIZE_T_MAX, seqDbr, par, subMat, true, 0, SIZE_T_MAX, hashDist);
+        }else{
+            fillKmerPositionArray<Parameters::DBTYPE_AMINO_ACIDS, T>(NULL, SIZE_T_MAX, seqDbr, par, subMat, true, 0, SIZE_T_MAX, hashDist);
+        }
+        seqDbr.remapData();
+        // figure out if machine has enough memory to run this job: the largest bucket must fit
+        size_t maxBucketSize = 0;
+        for(size_t i = 0; i < (USHRT_MAX+1); i++) {
+            maxBucketSize = std::max(maxBucketSize, hashDist[i]);
+        }
+        if(maxBucketSize > totalKmers){
+            Debug(Debug::ERROR) << "Not enough memory to run the kmermatcher. Minimum is at least " << maxBucketSize* sizeof(KmerPosition<T>) << " bytes\n";
+            EXIT(EXIT_FAILURE);
+        }
+        // define splits: close the current range right before the bucket that would reach totalKmers
+        size_t currBucketSize = 0;
+        size_t currBucketStart = 0;
+        for(size_t i = 0; i < (USHRT_MAX+1); i++){
+            // i > currBucketStart: never emit an empty range; at i == 0 the end i - 1 would wrap to SIZE_T_MAX ("no split" in doComputation)
+            if(i > currBucketStart && currBucketSize+hashDist[i] >= totalKmers){
+                hashRanges.emplace_back(currBucketStart, i - 1);
+                currBucketSize = 0;
+                currBucketStart = i;
+            }
+            currBucketSize+=hashDist[i];
+        }
+        hashRanges.emplace_back(currBucketStart, (USHRT_MAX+1));
+        delete [] hashDist;
+    }else{
+        hashRanges.emplace_back(0, SIZE_T_MAX);
+    }
+    return hashRanges;
+}
+
 int kmermatcher(int argc, const char **argv, const Command &command) {
     MMseqsMPI::init(argc, argv);
 
@@ -869,33 +867,29 @@ void writeKmerMatcherResult(DBWriter & dbw,
                 prefResultsOutString.append(buffer, len);
             }
             unsigned int targetId = hashSeqPair[kmerPos].id;
-            unsigned short diagonal = hashSeqPair[kmerPos].pos;
+            T diagonal = hashSeqPair[kmerPos].pos;
             size_t kmerOffset = 0;
-            short prevDiagonal = diagonal;
+            T prevDiagonal = diagonal;
             size_t maxDiagonal = 0;
             size_t diagonalCnt = 0;
             size_t topScore =0;
             int bestReverMask = reverMask;
             // compute best diagonal and score for every group of target sequences
-
             while(lastTargetId != targetId
                   && kmerPos+kmerOffset < threadOffsets[thread+1]
-                  && hashSeqPair[kmerPos+kmerOffset].id == targetId
-                  && ((TYPE ==Parameters::DBTYPE_NUCLEOTIDES)? BIT_CLEAR(hashSeqPair[kmerPos+kmerOffset].kmer, 63):
-                      hashSeqPair[kmerPos+kmerOffset].kmer) == repSeqId){
-
-                 if(prevDiagonal == hashSeqPair[kmerPos+kmerOffset].pos){
-                     diagonalCnt++;
-                 }else{
-                     diagonalCnt = 1;
-                 }
-                 if(diagonalCnt >= maxDiagonal){
-                     diagonal = hashSeqPair[kmerPos+kmerOffset].pos;
-                     maxDiagonal = diagonalCnt;
-                     if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){
-                         bestReverMask = BIT_CHECK(hashSeqPair[kmerPos+kmerOffset].kmer, 63) == false;
-                     }
-                 }
+                  && hashSeqPair[kmerPos+kmerOffset].id == targetId){
+                if(prevDiagonal == hashSeqPair[kmerPos+kmerOffset].pos){
+                    diagonalCnt++;
+                }else{
+                    diagonalCnt = 1;
+                }
+                if(diagonalCnt >= maxDiagonal){
+                    diagonal = hashSeqPair[kmerPos+kmerOffset].pos;
+                    maxDiagonal = diagonalCnt;
+                    if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){
+                        bestReverMask = BIT_CHECK(hashSeqPair[kmerPos+kmerOffset].kmer, 63) == false;
+                    }
+                }
                 prevDiagonal = hashSeqPair[kmerPos+kmerOffset].pos;
                 kmerOffset++;
                 topScore++;
@@ -963,12 +957,19 @@ void mergeKmerFilesAndOutput(DBWriter & dbw,
     for(size_t file = 0; file < tmpFiles.size(); file++){
         files[file] = FileUtil::openFileOrDie(tmpFiles[file].c_str(),"r",true);
         size_t dataSize;
-        entries[file]    = (T*)FileUtil::mmapFile(files[file], &dataSize);
+        struct stat sb;
+        fstat(fileno(files[file]) , &sb);
+        if(sb.st_size > 0){
+            entries[file]    = (T*)FileUtil::mmapFile(files[file], &dataSize);
 #if HAVE_POSIX_MADVISE
-        if (posix_madvise (entries[file], dataSize, POSIX_MADV_SEQUENTIAL) != 0){
-            Debug(Debug::ERROR) << "posix_madvise returned an error for file " << tmpFiles[file] << "\n";
-        }
+            if (posix_madvise (entries[file], dataSize, POSIX_MADV_SEQUENTIAL) != 0){
+                Debug(Debug::ERROR) << "posix_madvise returned an error for file " << tmpFiles[file] << "\n";
+            }
 #endif
+        }else{
+            dataSize = 0;
+        }
+
         dataSizes[file]  = dataSize;
         entrySizes[file] = dataSize/sizeof(T);
     }
@@ -1083,7 +1084,7 @@ void mergeKmerFilesAndOutput(DBWriter & dbw,
     }
     for(size_t file = 0; file < tmpFiles.size(); file++) {
         fclose(files[file]);
-        if(munmap((void*)entries[file], dataSizes[file]) < 0){
+        if(dataSizes[file] > 0 && munmap((void*)entries[file], dataSizes[file]) < 0){
             Debug(Debug::ERROR) << "Failed to munmap memory dataSize=" << dataSizes[file] <<"\n";
             EXIT(EXIT_FAILURE);
         }
@@ -1188,8 +1189,8 @@ void writeKmersToDisk(std::string tmpFile, KmerPosition<seqLenType> *hashSeqPair
 void setKmerLengthAndAlphabet(Parameters &parameters, size_t aaDbSize, int seqTyp) {
     if(Parameters::isEqualDbtype(seqTyp, Parameters::DBTYPE_NUCLEOTIDES)){
         if(parameters.kmerSize == 0) {
-            parameters.kmerSize = std::max(15, static_cast<int>(log(static_cast<float>(aaDbSize))/log(4)));
-            parameters.alphabetSize = 5;
+            parameters.kmerSize = std::max(17, static_cast<int>(log(static_cast<float>(aaDbSize))/log(4)));
+            parameters.alphabetSize.nucleotides = 5;
 
         }
         if(parameters.kmersPerSequence == 0){
@@ -1199,13 +1200,13 @@ void setKmerLengthAndAlphabet(Parameters &parameters, size_t aaDbSize, int seqTy
         if(parameters.kmerSize == 0){
             if((parameters.seqIdThr+0.001)>=0.99){
                 parameters.kmerSize = 14;
-                parameters.alphabetSize = 21;
+                parameters.alphabetSize.aminoacids = 21;
             }else if((parameters.seqIdThr+0.001)>=0.9){
                 parameters.kmerSize = 14;
-                parameters.alphabetSize = 13;
+                parameters.alphabetSize.aminoacids = 13;
             }else{
                 parameters.kmerSize = std::max(10, static_cast<int>(log(static_cast<float>(aaDbSize))/log(8.7)));
-                parameters.alphabetSize = 13;
+                parameters.alphabetSize.aminoacids = 13;
             }
         }
         if(parameters.kmersPerSequence == 0){
@@ -1214,54 +1215,18 @@ void setKmerLengthAndAlphabet(Parameters &parameters, size_t aaDbSize, int seqTy
     }
 }
 
-template std::pair<size_t, size_t>  fillKmerPositionArray<0, short>(KmerPosition<short> * hashSeqPair,
-                                                             DBReader<unsigned int> &seqDbr,
-                                                             Parameters & par, BaseMatrix * subMat,
-                                                             size_t KMER_SIZE, size_t chooseTopKmer,
-                                                             bool includeIdenticalKmer, size_t splits, size_t split,
-                                                             size_t pickNBest,
-                                                             bool adjustKmerLength,
-                                                             float chooseTopKmerScale);
-template std::pair<size_t, size_t>  fillKmerPositionArray<1, short>(KmerPosition<short> * hashSeqPair,
-                                                             DBReader<unsigned int> &seqDbr,
-                                                             Parameters & par, BaseMatrix * subMat,
-                                                             size_t KMER_SIZE, size_t chooseTopKmer,
-                                                             bool includeIdenticalKmer, size_t splits, size_t split,
-                                                             size_t pickNBest,
-                                                             bool adjustKmerLength,
-                                                             float chooseTopKmerScale);
-template std::pair<size_t, size_t>  fillKmerPositionArray<2, short>(KmerPosition<short> * hashSeqPair,
-                                                             DBReader<unsigned int> &seqDbr,
-                                                             Parameters & par, BaseMatrix * subMat,
-                                                             size_t KMER_SIZE, size_t chooseTopKmer,
-                                                             bool includeIdenticalKmer, size_t splits, size_t split,
-                                                             size_t pickNBest,
-                                                             bool adjustKmerLength,
-                                                             float chooseTopKmerScale);
-template std::pair<size_t, size_t>  fillKmerPositionArray<0, int>(KmerPosition<int> * hashSeqPair,
-                                                             DBReader<unsigned int> &seqDbr,
-                                                             Parameters & par, BaseMatrix * subMat,
-                                                             size_t KMER_SIZE, size_t chooseTopKmer,
-                                                             bool includeIdenticalKmer, size_t splits, size_t split,
-                                                             size_t pickNBest,
-                                                             bool adjustKmerLength,
-                                                             float chooseTopKmerScale);
-template std::pair<size_t, size_t>  fillKmerPositionArray<1, int>(KmerPosition <int>* hashSeqPair,
-                                                             DBReader<unsigned int> &seqDbr,
-                                                             Parameters & par, BaseMatrix * subMat,
-                                                             size_t KMER_SIZE, size_t chooseTopKmer,
-                                                             bool includeIdenticalKmer, size_t splits, size_t split,
-                                                             size_t pickNBest,
-                                                             bool adjustKmerLength,
-                                                             float chooseTopKmerScale);
-template std::pair<size_t, size_t>  fillKmerPositionArray<2, int>(KmerPosition< int> * hashSeqPair,
-                                                             DBReader<unsigned int> &seqDbr,
-                                                             Parameters & par, BaseMatrix * subMat,
-                                                             size_t KMER_SIZE, size_t chooseTopKmer,
-                                                             bool includeIdenticalKmer, size_t splits, size_t split,
-                                                             size_t pickNBest,
-                                                             bool adjustKmerLength,
-                                                             float chooseTopKmerScale);
+template std::pair<size_t, size_t>  fillKmerPositionArray<0, short>(KmerPosition<short> * kmerArray, size_t kmerArraySize, DBReader<unsigned int> &seqDbr,
+                                                                    Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution);
+template std::pair<size_t, size_t>  fillKmerPositionArray<1, short>(KmerPosition<short> * kmerArray, size_t kmerArraySize, DBReader<unsigned int> &seqDbr,
+                                                                    Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution);
+template std::pair<size_t, size_t>  fillKmerPositionArray<2, short>(KmerPosition<short> * kmerArray, size_t kmerArraySize, DBReader<unsigned int> &seqDbr,
+                                                                    Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution);
+template std::pair<size_t, size_t>  fillKmerPositionArray<0, int>(KmerPosition<int> * kmerArray, size_t kmerArraySize, DBReader<unsigned int> &seqDbr,
+                                                                  Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution);
+template std::pair<size_t, size_t>  fillKmerPositionArray<1, int>(KmerPosition <int>* kmerArray, size_t kmerArraySize, DBReader<unsigned int> &seqDbr,
+                                                                  Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution);
+template std::pair<size_t, size_t>  fillKmerPositionArray<2, int>(KmerPosition< int> * kmerArray, size_t kmerArraySize, DBReader<unsigned int> &seqDbr,
+                                                                  Parameters & par, BaseMatrix * subMat, bool hashWholeSequence, size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution);
 
 template KmerPosition<short> *initKmerPositionMemory(size_t size);
 template KmerPosition<int> *initKmerPositionMemory(size_t size);
@@ -1269,4 +1234,7 @@ template KmerPosition<int> *initKmerPositionMemory(size_t size);
 template size_t computeMemoryNeededLinearfilter<short>(size_t totalKmer);
 template size_t computeMemoryNeededLinearfilter<int>(size_t totalKmer);
 
+template std::vector<std::pair<size_t, size_t>>  setupKmerSplits<short>(Parameters &par, BaseMatrix * subMat, DBReader<unsigned int> &seqDbr, size_t totalKmers, size_t splits);
+template std::vector<std::pair<size_t, size_t>>  setupKmerSplits<int>(Parameters &par, BaseMatrix * subMat, DBReader<unsigned int> &seqDbr, size_t totalKmers, size_t splits);
+
 #undef SIZE_T_MAX
diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h
index 741cf4f..d342208 100644
--- a/src/linclust/kmermatcher.h
+++ b/src/linclust/kmermatcher.h
@@ -7,6 +7,46 @@
 #include "Parameters.h"
 #include "BaseMatrix.h"
 
+
+struct SequencePosition{
+    // Record holding an unsigned short score, a k-mer value and a position.
+    // The static comparators below define two orderings over these three fields.
+
+    unsigned short score;
+    size_t kmer;
+    unsigned int pos;
+
+    // Ascending order by score; ties are broken by kmer and then by pos.
+    static bool compareByScore(const SequencePosition &first, const SequencePosition &second){
+        if(first.score != second.score){
+            return first.score < second.score;
+        }
+        if(first.kmer != second.kmer){
+            return first.kmer < second.kmer;
+        }
+        // score and kmer are equal, so the position decides
+        return first.pos < second.pos;
+    }
+
+    // Same ordering as compareByScore, except that both kmer values are passed
+    // through BIT_SET(kmer, 63) before they are compared.
+    // NOTE(review): BIT_SET is defined outside this header; presumably it forces
+    // bit 63 on so that this bit is ignored by the comparison - confirm.
+    static bool compareByScoreReverse(const SequencePosition &first, const SequencePosition &second){
+        if(first.score != second.score){
+            return first.score < second.score;
+        }
+
+        const size_t lhsKmer = BIT_SET(first.kmer, 63);
+        const size_t rhsKmer = BIT_SET(second.kmer, 63);
+        if(lhsKmer != rhsKmer){
+            return lhsKmer < rhsKmer;
+        }
+        // score and masked kmer are equal, so the position decides
+        return first.pos < second.pos;
+    }
+};
+
 template <typename T>
 struct __attribute__((__packed__))KmerPosition {
     size_t kmer;
@@ -173,23 +213,29 @@ template <int TYPE, typename T>
 void writeKmerMatcherResult(DBWriter & dbw, KmerPosition<T> *hashSeqPair, size_t totalKmers,
                             std::vector<char> &repSequence, size_t threads);
 
+
 template <typename T>
 KmerPosition<T> * doComputation(size_t totalKmers, size_t split, size_t splits, std::string splitFile,
-                             DBReader<unsigned int> & seqDbr, Parameters & par, BaseMatrix  * subMat,
-                             size_t KMER_SIZE, size_t chooseTopKmer, float chooseTopKmerScale = 0.0);
+                                DBReader<unsigned int> & seqDbr, Parameters & par, BaseMatrix  * subMat,
+                                size_t KMER_SIZE, size_t chooseTopKmer, float chooseTopKmerScale = 0.0);
 template <typename T>
 KmerPosition<T> *initKmerPositionMemory(size_t size);
 
 template <int TYPE, typename T>
-std::pair<size_t, size_t>  fillKmerPositionArray(KmerPosition<T> * hashSeqPair, DBReader<unsigned int> &seqDbr,
-                             Parameters & par, BaseMatrix * subMat,
-                             const size_t KMER_SIZE, size_t chooseTopKmer,
-                             bool includeIdenticalKmer, size_t splits, size_t split, size_t pickNBest,
-                             bool adjustLength, float chooseTopKmerScale = 0.0);
+std::pair<size_t, size_t>  fillKmerPositionArray(KmerPosition<T> * kmerArray, size_t kmerArraySize, DBReader<unsigned int> &seqDbr,
+                                                 Parameters & par, BaseMatrix * subMat, bool hashWholeSequence,
+                                                 size_t hashStartRange, size_t hashEndRange, size_t * hashDistribution);
+
+
+void maskSequence(int maskMode, int maskLowerCase,
+                  Sequence &seq, int maskLetter, ProbabilityMatrix * probMatrix);
 
 template <typename T>
 size_t computeMemoryNeededLinearfilter(size_t totalKmer);
 
+template <typename T>
+std::vector<std::pair<size_t, size_t>> setupKmerSplits(Parameters &par, BaseMatrix * subMat, DBReader<unsigned int> &seqDbr, size_t totalKmers, size_t splits);
+
 size_t computeKmerCount(DBReader<unsigned int> &reader, size_t KMER_SIZE, size_t chooseTopKmer,
                         float chooseTopKmerScale = 0.0);
 
@@ -197,11 +243,6 @@ void setLinearFilterDefault(Parameters *p);
 
 size_t computeMemoryNeededLinearfilter(size_t totalKmer);
 
-unsigned circ_hash(const int * x, unsigned length, const unsigned rol);
-
-unsigned circ_hash_next(const int * x, unsigned length, int x_first, short unsigned h, const unsigned rol);
-
-
 
 #undef SIZE_T_MAX
 
diff --git a/src/linclust/kmersearch.cpp b/src/linclust/kmersearch.cpp
index 03583e8..8fb0e37 100644
--- a/src/linclust/kmersearch.cpp
+++ b/src/linclust/kmersearch.cpp
@@ -21,43 +21,27 @@
 #define SIZE_T_MAX ((size_t) -1)
 #endif
 
-KmerSearch::ExtractKmerAndSortResult KmerSearch::extractKmerAndSort(size_t totalKmers, size_t split, size_t splits, DBReader<unsigned int> & seqDbr,
-                                                                 Parameters & par, BaseMatrix  * subMat, size_t KMER_SIZE, size_t chooseTopKmer, size_t pickNBest, bool adjustLength) {
-    Debug(Debug::INFO) << "Generate k-mers list " << split <<"\n";
-
-    size_t splitKmerCount = totalKmers;
-    if(splits > 1){
-        size_t memoryLimit;
-        if (par.splitMemoryLimit > 0) {
-            memoryLimit = static_cast<size_t>(par.splitMemoryLimit);
-        } else {
-            memoryLimit = static_cast<size_t>(Util::getTotalSystemMemory() * 0.9);
-        }
-        // we do not really know how much memory is needed. So this is our best choice
-        splitKmerCount = (memoryLimit / sizeof(KmerPosition<short>));
-    }
+KmerSearch::ExtractKmerAndSortResult KmerSearch::extractKmerAndSort(size_t totalKmers, size_t hashStartRange, size_t hashEndRange, DBReader<unsigned int> & seqDbr,
+                                                                    Parameters & par, BaseMatrix  * subMat) {
 
-    KmerPosition<short> * hashSeqPair = initKmerPositionMemory<short>(splitKmerCount*pickNBest);
+    KmerPosition<short> * hashSeqPair = initKmerPositionMemory<short>(totalKmers);
     Timer timer;
     size_t elementsToSort;
-    if(pickNBest > 1){
-        std::pair<size_t, size_t> ret = fillKmerPositionArray<Parameters::DBTYPE_HMM_PROFILE,short>(hashSeqPair, seqDbr, par, subMat, KMER_SIZE,
-                                                                               chooseTopKmer, false, splits, split, pickNBest, false);
+    if(par.pickNbest > 1){
+        std::pair<size_t, size_t> ret = fillKmerPositionArray<Parameters::DBTYPE_HMM_PROFILE,short>(hashSeqPair, totalKmers, seqDbr, par, subMat, false, hashStartRange, hashEndRange, NULL);
         elementsToSort = ret.first;
     } else if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){
-        std::pair<size_t, size_t> ret = fillKmerPositionArray<Parameters::DBTYPE_NUCLEOTIDES,short>(hashSeqPair, seqDbr, par, subMat, KMER_SIZE,
-                                                                               chooseTopKmer, false, splits, split, 1, adjustLength);
+        std::pair<size_t, size_t> ret = fillKmerPositionArray<Parameters::DBTYPE_NUCLEOTIDES,short>(hashSeqPair, totalKmers, seqDbr, par, subMat, false, hashStartRange, hashEndRange, NULL);
         elementsToSort = ret.first;
-        KMER_SIZE = ret.second;
-        Debug(Debug::INFO) << "\nAdjusted k-mer length " << KMER_SIZE << "\n";
+        par.kmerSize = ret.second;
+        Debug(Debug::INFO) << "\nAdjusted k-mer length " << par.kmerSize << "\n";
     }else {
-        std::pair<size_t, size_t> ret = fillKmerPositionArray<Parameters::DBTYPE_AMINO_ACIDS, short>(hashSeqPair, seqDbr, par, subMat, KMER_SIZE,
-                                                                               chooseTopKmer, false, splits, split, 1, false);
+        std::pair<size_t, size_t> ret = fillKmerPositionArray<Parameters::DBTYPE_AMINO_ACIDS, short>(hashSeqPair, totalKmers, seqDbr, par, subMat, false, hashStartRange, hashEndRange, NULL);
         elementsToSort = ret.first;
 
     }
     Debug(Debug::INFO) << "\nTime for fill: " << timer.lap() << "\n";
-    if(splits == 1){
+    if(hashEndRange == SIZE_T_MAX){
         seqDbr.unmapData();
     }
 
@@ -72,7 +56,7 @@ KmerSearch::ExtractKmerAndSortResult KmerSearch::extractKmerAndSort(size_t total
 
     Debug(Debug::INFO) << "Time for sort: " << timer.lap() << "\n";
 
-    return ExtractKmerAndSortResult(elementsToSort, hashSeqPair, KMER_SIZE);
+    return ExtractKmerAndSortResult(elementsToSort, hashSeqPair, par.kmerSize);
 }
 
 template <int TYPE>
@@ -131,7 +115,6 @@ void KmerSearch::writeResult(DBWriter & dbw, KmerPosition<short> *kmers, size_t
 
         hit_t h;
         h.seqId = prevHitId;
-        bestRevertMask = (repSeqId == prevHitId) ? 0 : bestRevertMask;
         h.prefScore =  (bestRevertMask) ? -topScore : topScore;
         h.diagonal =  bestDiagonal;
         int len = QueryMatcher::prefilterHitToBuffer(buffer, h);
@@ -155,7 +138,6 @@ int kmersearch(int argc, const char **argv, const Command &command) {
     par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR);
     int targetSeqType;
     int adjustedKmerSize = 0;
-    bool isAdjustedKmerLen = false;
     if (Parameters::isEqualDbtype(FileUtil::parseDbType(par.db2.c_str()), Parameters::DBTYPE_INDEX_DB) == false) {
         Debug(Debug::ERROR) << "Create index before calling kmersearch with mmseqs createlinindex.\n";
         EXIT(EXIT_FAILURE);
@@ -172,9 +154,9 @@ int kmersearch(int argc, const char **argv, const Command &command) {
         }
     }
     if(par.PARAM_ALPH_SIZE.wasSet){
-        if(data.alphabetSize != par.alphabetSize){
-            Debug(Debug::ERROR) << "Index was created with --alph-size  " << data.alphabetSize << " but the prefilter was called with --alph-size " << par.alphabetSize << "!\n";
-            Debug(Debug::ERROR) << "createindex --alph-size " << par.alphabetSize << "\n";
+        if(data.alphabetSize != (Parameters::isEqualDbtype(data.seqType, Parameters::DBTYPE_AMINO_ACIDS)? par.alphabetSize.aminoacids:par.alphabetSize.nucleotides)){
+            Debug(Debug::ERROR) << "Index was created with --alph-size  " << data.alphabetSize << " but the prefilter was called with --alph-size " << MultiParam<int>::format(par.alphabetSize) << "!\n";
+            Debug(Debug::ERROR) << "createindex --alph-size " << MultiParam<int>::format(par.alphabetSize) << "\n";
             EXIT(EXIT_FAILURE);
         }
     }
@@ -192,7 +174,6 @@ int kmersearch(int argc, const char **argv, const Command &command) {
     par.maxSeqLen = data.maxSeqLength;
     // Reuse the compBiasCorr field to store the adjustedKmerSize, It is not needed in the linsearch
     adjustedKmerSize = data.compBiasCorr;
-    isAdjustedKmerLen = data.kmerSize != adjustedKmerSize;
 
     DBReader<unsigned int> queryDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
     queryDbr.open(DBReader<unsigned int>::NOSORT);
@@ -207,8 +188,6 @@ int kmersearch(int argc, const char **argv, const Command &command) {
     par.printParameters(command.cmd, argc, argv, *command.params);
 
     //queryDbr.readMmapedDataInMemory();
-    const size_t KMER_SIZE = par.kmerSize;
-    size_t chooseTopKmer = par.kmersPerSequence;
 
     // memoryLimit in bytes
     size_t memoryLimit;
@@ -217,41 +196,42 @@ int kmersearch(int argc, const char **argv, const Command &command) {
     } else {
         memoryLimit = static_cast<size_t>(Util::getTotalSystemMemory() * 0.9);
     }
-
-    size_t totalKmers = computeKmerCount(queryDbr, KMER_SIZE, chooseTopKmer);
+    float kmersPerSequenceScale = (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) ?
+                                  par.kmersPerSequenceScale.nucleotides : par.kmersPerSequenceScale.aminoacids;
+    size_t totalKmers = computeKmerCount(queryDbr, par.kmerSize, par.kmersPerSequence, kmersPerSequenceScale);
     size_t totalSizeNeeded = computeMemoryNeededLinearfilter<short>(totalKmers);
-    Debug(Debug::INFO) << "Estimated memory consumption " << totalSizeNeeded/1024/1024 << " MB\n";
 
     BaseMatrix *subMat;
     if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
         subMat = new NucleotideMatrix(par.seedScoringMatrixFile.nucleotides, 1.0, 0.0);
     } else {
-        if (par.alphabetSize == 21) {
+        if (par.alphabetSize.aminoacids == 21) {
             subMat = new SubstitutionMatrix(par.seedScoringMatrixFile.aminoacids, 8.0, -0.2);
         } else {
             SubstitutionMatrix sMat(par.seedScoringMatrixFile.aminoacids, 8.0, -0.2);
-            subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2int, sMat.int2aa, sMat.alphabetSize, par.alphabetSize, 8.0);
+            subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2num, sMat.num2aa, sMat.alphabetSize, par.alphabetSize.aminoacids, 8.0);
         }
     }
 
     // compute splits
     size_t splits = static_cast<size_t>(std::ceil(static_cast<float>(totalSizeNeeded) / memoryLimit));
-//    size_t splits = 2;
-    if (splits > 1) {
-//         security buffer
-        splits += 1;
-    }
+    size_t totalKmersPerSplit = std::max(static_cast<size_t>(1024+1),
+                                         static_cast<size_t>(std::min(totalSizeNeeded, memoryLimit)/sizeof(KmerPosition<short>))+1);
+
+    std::vector<std::pair<size_t, size_t>> hashRanges = setupKmerSplits<short>(par, subMat, queryDbr, totalKmersPerSplit, splits);
+
     int outDbType = (Parameters::isEqualDbtype(queryDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) ? Parameters::DBTYPE_PREFILTER_REV_RES : Parameters::DBTYPE_PREFILTER_RES;
-    Debug(Debug::INFO) << "Process file into " << splits << " parts\n";
+    Debug(Debug::INFO) << "Process file into " << hashRanges.size() << " parts\n";
 
     std::vector<std::string> splitFiles;
-    for (size_t split = 0; split < splits; split++) {
+    for (size_t split = 0; split < hashRanges.size(); split++) {
         tidxdbr.remapData();
         char *entriesData = tidxdbr.getDataUncompressed(tidxdbr.getId(PrefilteringIndexReader::ENTRIES));
         char *entriesOffsetsData = tidxdbr.getDataUncompressed(tidxdbr.getId(PrefilteringIndexReader::ENTRIESOFFSETS));
         int64_t entriesNum = *((int64_t *) tidxdbr.getDataUncompressed(tidxdbr.getId(PrefilteringIndexReader::ENTRIESNUM)));
         int64_t entriesGridSize = *((int64_t *) tidxdbr.getDataUncompressed(tidxdbr.getId(PrefilteringIndexReader::ENTRIESGRIDSIZE)));
-        KmerIndex kmerIndex(par.alphabetSize, adjustedKmerSize, entriesData, entriesOffsetsData, entriesNum, entriesGridSize);
+        int alphabetSize = (Parameters::isEqualDbtype(queryDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) ? par.alphabetSize.nucleotides:par.alphabetSize.aminoacids;
+        KmerIndex kmerIndex(alphabetSize, adjustedKmerSize, entriesData, entriesOffsetsData, entriesNum, entriesGridSize);
 //        kmerIndex.printIndex<Parameters::DBTYPE_NUCLEOTIDES>(subMat);
         std::pair<std::string, std::string> tmpFiles;
         if (splits > 1) {
@@ -260,22 +240,19 @@ int kmersearch(int argc, const char **argv, const Command &command) {
             tmpFiles = std::make_pair(par.db3, par.db3Index);
         }
         splitFiles.push_back(tmpFiles.first);
-        
+
         std::string splitFileNameDone = tmpFiles.first + ".done";
         if(FileUtil::fileExists(splitFileNameDone.c_str()) == false) {
-            KmerSearch::ExtractKmerAndSortResult sortedKmers = KmerSearch::extractKmerAndSort(totalKmers, split,
-                                                                                              splits, queryDbr, par,
-                                                                                              subMat,
-                                                                                              KMER_SIZE, chooseTopKmer,
-                                                                                              par.pickNbest,
-                                                                                              isAdjustedKmerLen);
+            KmerSearch::ExtractKmerAndSortResult sortedKmers = KmerSearch::extractKmerAndSort(totalKmersPerSplit, hashRanges[split].first,
+                                                                                              hashRanges[split].second, queryDbr, par,
+                                                                                              subMat);
             std::pair<KmerPosition<short> *, size_t> result;
             if (Parameters::isEqualDbtype(queryDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) {
                 result = KmerSearch::searchInIndex<Parameters::DBTYPE_NUCLEOTIDES>(sortedKmers.kmers,
-                                                                                   sortedKmers.kmerCount, kmerIndex);
+                                                                                   sortedKmers.kmerCount, kmerIndex, par.resultDirection);
             } else {
                 result = KmerSearch::searchInIndex<Parameters::DBTYPE_AMINO_ACIDS>(sortedKmers.kmers,
-                                                                                   sortedKmers.kmerCount, kmerIndex);
+                                                                                   sortedKmers.kmerCount, kmerIndex, par.resultDirection);
             }
 
             KmerPosition<short> *kmers = result.first;
@@ -292,7 +269,7 @@ int kmersearch(int argc, const char **argv, const Command &command) {
             } else {
                 if (Parameters::isEqualDbtype(queryDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) {
                     writeKmersToDisk<Parameters::DBTYPE_NUCLEOTIDES, KmerEntryRev, short>(tmpFiles.first, kmers,
-                            kmerCount );
+                                                                                          kmerCount );
                 } else {
                     writeKmersToDisk<Parameters::DBTYPE_AMINO_ACIDS, KmerEntry, short>(tmpFiles.first, kmers, kmerCount );
                 }
@@ -322,9 +299,9 @@ int kmersearch(int argc, const char **argv, const Command &command) {
     return EXIT_SUCCESS;
 }
 template  <int TYPE>
-std::pair<KmerPosition<short> *,size_t > KmerSearch::searchInIndex( KmerPosition<short> *kmers, size_t kmersSize, KmerIndex &kmerIndex) {
+std::pair<KmerPosition<short> *,size_t > KmerSearch::searchInIndex(KmerPosition<short> *kmers, size_t kmersSize, KmerIndex &kmerIndex, int resultDirection) {
     Timer timer;
-
+    bool queryTargetSwitched = (resultDirection == Parameters::PARAM_RESULT_DIRECTION_TARGET);
     kmerIndex.reset();
     KmerIndex::KmerEntry currTargetKmer;
     bool isDone = false;
@@ -395,48 +372,48 @@ std::pair<KmerPosition<short> *,size_t > KmerSearch::searchInIndex( KmerPosition
                 //  10 Same here, we can revert query to match the not inverted target
                 //  11 Both are reverted so no problem!
                 //  So we need just 1 bit of information to encode all four states
-                bool targetIsReverse = (BIT_CHECK(currQueryKmer->kmer, 63) == false);
-                bool repIsReverse = (BIT_CHECK(currTargetKmer.kmer, 63) == false);
+                bool targetIsReverse = (queryTargetSwitched) ? (BIT_CHECK(currQueryKmer->kmer, 63) == false) :
+                                       (BIT_CHECK(currTargetKmer.kmer, 63) == false);
+                bool repIsReverse = (queryTargetSwitched) ? (BIT_CHECK(currTargetKmer.kmer, 63) == false) :
+                                    (BIT_CHECK(currQueryKmer->kmer, 63) == false);
                 bool queryNeedsToBeRev = false;
                 // we now need 2 byte of information (00),(01),(10),(11)
                 // we need to flip the coordinates of the query
-                short queryPos=0;
-                short targetPos=0;
+                short queryPos = currTargetKmer.pos;
+                short targetPos= currQueryKmer->pos;;
                 // revert kmer in query hits normal kmer in target
                 // we need revert the query
                 if (repIsReverse == true && targetIsReverse == false){
-                    queryPos = currTargetKmer.pos;
-                    targetPos = currQueryKmer->pos;
                     queryNeedsToBeRev = true;
                     // both k-mers were extracted on the reverse strand
                     // this is equal to both are extract on the forward strand
                     // we just need to offset the position to the forward strand
                 }else if (repIsReverse == true && targetIsReverse == true){
-                    queryPos = (currTargetKmer.seqLen - 1) - currTargetKmer.pos;
+                    queryPos  = (currTargetKmer.seqLen - 1) - currTargetKmer.pos;
                     targetPos = (currQueryKmer->seqLen - 1) - currQueryKmer->pos;
                     queryNeedsToBeRev = false;
                     // query is not revers but target k-mer is reverse
                     // instead of reverting the target, we revert the query and offset the the query/target position
                 }else if (repIsReverse == false && targetIsReverse == true){
-                    queryPos = (currTargetKmer.seqLen - 1) - currTargetKmer.pos;
+                    queryPos  = (currTargetKmer.seqLen - 1) - currTargetKmer.pos;
                     targetPos = (currQueryKmer->seqLen - 1) - currQueryKmer->pos;
                     queryNeedsToBeRev = true;
                     // both are forward, everything is good here
-                }else{
-                    queryPos = currTargetKmer.pos;
-                    targetPos =  currQueryKmer->pos;
-                    queryNeedsToBeRev = false;
                 }
-                (kmers+writePos)->pos = queryPos - targetPos;
-                size_t id = (queryNeedsToBeRev) ? BIT_CLEAR(static_cast<size_t >(currTargetKmer.id), 63) : BIT_SET(static_cast<size_t >(currTargetKmer.id), 63);
+                (kmers+writePos)->pos = (queryTargetSwitched) ? queryPos - targetPos : targetPos - queryPos;
+                size_t id = (queryTargetSwitched) ? currTargetKmer.id : currQueryKmer->id;
+                id = (queryNeedsToBeRev) ? BIT_CLEAR(static_cast<size_t >(id), 63) :
+                     BIT_SET(static_cast<size_t >(id), 63);
                 (kmers+writePos)->kmer = id;
+                (kmers+writePos)->id   = (queryTargetSwitched) ? currQueryKmer->id : currTargetKmer.id;
             }else{
                 // i - j
-                (kmers+writePos)->kmer= currTargetKmer.id;
+                (kmers+writePos)->kmer = (queryTargetSwitched) ? currTargetKmer.id : currQueryKmer->id;
+                (kmers+writePos)->id   = (queryTargetSwitched) ? currQueryKmer->id : currTargetKmer.id;
 //                std::cout << currTargetKmer.pos - currQueryKmer->pos << "\t" << currTargetKmer.pos << "\t" << currQueryKmer->pos << std::endl;
-                (kmers+writePos)->pos = currTargetKmer.pos - currQueryKmer->pos;
+                (kmers+writePos)->pos  = (queryTargetSwitched) ? currTargetKmer.pos - currQueryKmer->pos :
+                                         currQueryKmer->pos - currTargetKmer.pos;
             }
-            (kmers+writePos)->id = currQueryKmer->id;
             (kmers+writePos)->seqLen = currQueryKmer->seqLen;
 
             writePos++;
@@ -457,7 +434,7 @@ std::pair<KmerPosition<short> *,size_t > KmerSearch::searchInIndex( KmerPosition
     return std::make_pair(kmers, writePos);
 }
 
-template std::pair<KmerPosition<short> *,size_t > KmerSearch::searchInIndex<0>( KmerPosition<short> *kmers, size_t kmersSize, KmerIndex &kmerIndex);
-template std::pair<KmerPosition<short> *,size_t > KmerSearch::searchInIndex<1>( KmerPosition<short> *kmers, size_t kmersSize, KmerIndex &kmerIndex);
+template std::pair<KmerPosition<short> *,size_t > KmerSearch::searchInIndex<0>( KmerPosition<short> *kmers, size_t kmersSize, KmerIndex &kmerIndex, int resultDirection);
+template std::pair<KmerPosition<short> *,size_t > KmerSearch::searchInIndex<1>( KmerPosition<short> *kmers, size_t kmersSize, KmerIndex &kmerIndex, int resultDirection);
 
 #undef SIZE_T_MAX
diff --git a/src/linclust/kmersearch.h b/src/linclust/kmersearch.h
index cbdef04..f949829 100644
--- a/src/linclust/kmersearch.h
+++ b/src/linclust/kmersearch.h
@@ -12,7 +12,7 @@ class KmerSearch{
 
 public:
     template  <int TYPE>
-    static std::pair<KmerPosition<short> *,size_t > searchInIndex( KmerPosition<short> *kmers, size_t kmersSize, KmerIndex &kmerIndex);
+    static std::pair<KmerPosition<short> *,size_t > searchInIndex( KmerPosition<short> *kmers, size_t kmersSize, KmerIndex &kmerIndex, int resultDirection);
 
     template  <int TYPE>
     static void writeResult(DBWriter & dbw, KmerPosition<short> *kmers, size_t kmerCount);
@@ -20,14 +20,13 @@ class KmerSearch{
 
     struct ExtractKmerAndSortResult{
         ExtractKmerAndSortResult(size_t kmerCount, KmerPosition<short> * kmers, size_t adjustedKmer)
-        : kmerCount(kmerCount), kmers(kmers), adjustedKmer(adjustedKmer)  {}
+                : kmerCount(kmerCount), kmers(kmers), adjustedKmer(adjustedKmer)  {}
         size_t kmerCount;
         KmerPosition<short> * kmers;
         size_t adjustedKmer;
     };
     static ExtractKmerAndSortResult extractKmerAndSort(size_t splitKmerCount, size_t split, size_t splits,
-                                                                DBReader<unsigned int> &seqDbr, Parameters &par, BaseMatrix *subMat,
-                                                                size_t KMER_SIZE, size_t chooseTopKmer, size_t pickNBest, bool adjustKmerLength);
+                                                       DBReader<unsigned int> &seqDbr, Parameters &par, BaseMatrix *subMat);
 };
 
 
diff --git a/src/multihit/MultiHitSearch.cpp b/src/multihit/MultiHitSearch.cpp
index 54c6043..9394c23 100644
--- a/src/multihit/MultiHitSearch.cpp
+++ b/src/multihit/MultiHitSearch.cpp
@@ -29,26 +29,21 @@ int multihitsearch(int argc, const char **argv, const Command &command) {
     Parameters &par = Parameters::getInstance();
     setMultiHitSearchWorkflowDefaults(&par);
 
-    par.overrideParameterDescription((Command &) command, par.PARAM_MAX_REJECTED.uniqid, NULL, NULL,
-                                     par.PARAM_MAX_REJECTED.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_DB_OUTPUT.uniqid, NULL, NULL,
-                                     par.PARAM_DB_OUTPUT.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_OVERLAP.uniqid, NULL, NULL,
-                                     par.PARAM_OVERLAP.category | MMseqsParameter::COMMAND_EXPERT);
-
+    par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT);
     for (size_t i = 0; i < par.extractorfs.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.extractorfs[i]->uniqid, NULL, NULL, par.extractorfs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     for (size_t i = 0; i < par.translatenucs.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.translatenucs[i]->uniqid, NULL, NULL, par.translatenucs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     for (size_t i = 0; i < par.result2profile.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.result2profile[i]->uniqid, NULL, NULL, par.result2profile[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.result2profile[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
-    par.overrideParameterDescription((Command &) command, par.PARAM_THREADS.uniqid, NULL, NULL,
-                                     par.PARAM_THREADS.category & ~MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_V.uniqid, NULL, NULL,
-                                     par.PARAM_V.category & ~MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
 
     par.parseParameters(argc, argv, command, true, 0, 0);
 
@@ -80,6 +75,7 @@ int multihitsearch(int argc, const char **argv, const Command &command) {
     cmd.addVariable("SEARCH_PAR", par.createParameterString(par.searchworkflow).c_str());
     cmd.addVariable("BESTHITBYSET_PAR", par.createParameterString(par.besthitbyset).c_str());
     cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str());
+    cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
 
     FileUtil::writeFile(tmpDir + "/multihitsearch.sh", multihitsearch_sh, multihitsearch_sh_len);
     std::string program(tmpDir + "/multihitsearch.sh");
diff --git a/src/multihit/combinepvalperset.cpp b/src/multihit/combinepvalperset.cpp
index 2c91f9a..30f0ee8 100644
--- a/src/multihit/combinepvalperset.cpp
+++ b/src/multihit/combinepvalperset.cpp
@@ -166,21 +166,41 @@ class PvalueAggregator : public Aggregation {
             updatedPval = exp(sumLogPval);   
         }
 
-        //3) the P-values of the truncated product method 
+        //3) the P-values of the (modified) truncated product method 
         else if(aggregationMode == Parameters::AGGREGATION_MODE_TRUNCATED_PRODUCT){
+            //new theory: take the best hit regardless of threshold and, from the second hit on, sum how much each hit surpassed the threshold
             unsigned int orfCount = Util::fast_atoi<unsigned int>(querySizeReader->getDataByDBKey(querySetKey, thread_idx));
             double logPvalThreshold = log(alpha / (orfCount + 1));
-            double sumLogPval = 0;
+            double minLogPval = 0;
+            double sumLogPval = 0; 
+            size_t k = 0;
             for (size_t i = 0; i < dataToAggregate.size(); ++i) {
                 double logPvalue = std::strtod(dataToAggregate[i][1].c_str(), NULL);
+                if (logPvalue < minLogPval) {
+                    if (logPvalue == 0) {
+                        //to avoid -0.0
+                        minLogPval = logPvalue;
+                    }
+                    else {minLogPval = -logPvalue;}
+                }
                 if (logPvalue < logPvalThreshold) {
-                    sumLogPval += logPvalue;
+                    //sum up the part exceeding logThreshold, add a minus to make score positive
+                    sumLogPval -= logPvalue - logPvalThreshold;
+                    k++;
                 }
             }
-            updatedPval = exp(sumLogPval); 
+            if(k == 0){
+                //if no hit passed thr, take the -log of best hit pval as score
+                buffer.append(SSTR(minLogPval));
+                return buffer;
+            }
+            else {
+                //if one or more hits passed the threshold
+                buffer.append(SSTR(sumLogPval - logPvalThreshold));
+                return buffer;
+            }
         }
 
-
         
         else {
             Debug(Debug::ERROR) << "Invalid aggregation function!\n";
diff --git a/src/prefiltering/CacheFriendlyOperations.cpp b/src/prefiltering/CacheFriendlyOperations.cpp
index 8a16341..7eb00b9 100644
--- a/src/prefiltering/CacheFriendlyOperations.cpp
+++ b/src/prefiltering/CacheFriendlyOperations.cpp
@@ -1,107 +1,107 @@
 #include "CacheFriendlyOperations.h"
-#include <new>
-#include <iostream>
-#include "IndexTable.h"
 #include "Util.h"
 
-template<unsigned int BINSIZE> CacheFriendlyOperations<BINSIZE>::CacheFriendlyOperations(size_t maxElement, size_t initBinSize) {
+#include <cmath>
+
+template<unsigned int BINSIZE>
+CacheFriendlyOperations<BINSIZE>::CacheFriendlyOperations(size_t maxElement, size_t initBinSize) {
     // find nearest upper power of 2^(x)
     size_t size = pow(2, ceil(log(maxElement)/log(2)));
-    size = std::max(size  >> MASK_0_5_BIT, (size_t) 1); // space needed in bit array
+    size = std::max(size >> MASK_0_5_BIT, (size_t) 1); // space needed in bit array
     duplicateBitArraySize = size;
     duplicateBitArray = new(std::nothrow) unsigned char[size];
-    Util::checkAllocation(duplicateBitArray, "Can not allocate duplicateBitArray memory in CacheFriendlyOperations");
+    Util::checkAllocation(duplicateBitArray, "Cannot allocate duplicateBitArray memory in CacheFriendlyOperations");
     memset(duplicateBitArray, 0, duplicateBitArraySize * sizeof(unsigned char));
+
     // find nearest upper power of 2^(x)
     initBinSize = pow(2, ceil(log(initBinSize)/log(2)));
     binSize = initBinSize;
     tmpElementBuffer = new(std::nothrow) TmpResult[binSize];
-    Util::checkAllocation(tmpElementBuffer, "Can not allocate tmpElementBuffer memory in CacheFriendlyOperations");
+    Util::checkAllocation(tmpElementBuffer, "Cannot allocate tmpElementBuffer memory in CacheFriendlyOperations");
 
-    bins = new CounterResult*[BINCOUNT];
-    binDataFrame = new(std::nothrow) CounterResult[BINCOUNT * binSize];
-    Util::checkAllocation(binDataFrame, "Can not allocate binDataFrame memory in CacheFriendlyOperations");
+    bins = new(std::nothrow) CounterResult*[BINCOUNT];
+    Util::checkAllocation(bins, "Cannot allocate bins memory in CacheFriendlyOperations");
 
+    binDataFrame = new(std::nothrow) CounterResult[BINCOUNT * binSize];
+    Util::checkAllocation(binDataFrame, "Cannot allocate binDataFrame memory in CacheFriendlyOperations");
 }
 
-template<unsigned int BINSIZE> CacheFriendlyOperations<BINSIZE>::~CacheFriendlyOperations<BINSIZE>(){
-    delete [] duplicateBitArray;
-    delete [] binDataFrame;
-    delete [] tmpElementBuffer;
-    delete [] bins;
+template<unsigned int BINSIZE>
+CacheFriendlyOperations<BINSIZE>::~CacheFriendlyOperations<BINSIZE>(){
+    delete[] duplicateBitArray;
+    delete[] binDataFrame;
+    delete[] tmpElementBuffer;
+    delete[] bins;
 }
 
-template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::countElements(IndexEntryLocal **input, CounterResult *output,
-                                                                              size_t outputSize, unsigned short indexFrom, unsigned short indexTo,
-                                                                              bool computeTotalScore)
-{
-    newStart:
-    setupBinPointer(bins, BINCOUNT, binDataFrame, binSize);
-    CounterResult * lastPosition = (binDataFrame + BINCOUNT * binSize) - 1;
-
-    for(unsigned int i = indexFrom; i < indexTo; i++){
-        const size_t N = input[i + 1] - input[i];
-        hashIndexEntry(i, input[i], N, this->bins, lastPosition);
-    }
-    if(checkForOverflowAndResizeArray(bins, BINCOUNT, binSize) == true) // overflowed occurred
-        goto newStart;
-    return findDuplicates(this->bins, this->BINCOUNT, output, outputSize, computeTotalScore);
+template<unsigned int BINSIZE>
+size_t CacheFriendlyOperations<BINSIZE>::findDuplicates(IndexEntryLocal **input, CounterResult *output,
+        size_t outputSize, unsigned short indexFrom, unsigned short indexTo, bool computeTotalScore) {
+    do {
+        setupBinPointer();
+        CounterResult *lastPosition = (binDataFrame + BINCOUNT * binSize) - 1;
+        for (unsigned int i = indexFrom; i < indexTo; ++i) {
+            const size_t N = input[i + 1] - input[i];
+            hashIndexEntry(i, input[i], N, lastPosition);
+        }
+    } while (checkForOverflowAndResizeArray(true) == true); // overflowed occurred
+    return findDuplicates(output, outputSize, computeTotalScore);
 }
 
-template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::mergeElementsByScore(CounterResult *inputOutputArray, const size_t N) {
-    newStart:
-    setupBinPointer(bins, BINCOUNT, binDataFrame, binSize);
-    hashElements(inputOutputArray, N, this->bins);
-    if(checkForOverflowAndResizeArray(bins, BINCOUNT, binSize) == true) // overflowed occurred
-        goto newStart;
-    return mergeDuplicates(this->bins, this->BINCOUNT, inputOutputArray);
+template<unsigned int BINSIZE>
+size_t CacheFriendlyOperations<BINSIZE>::mergeElementsByScore(CounterResult *inputOutputArray, const size_t N) {
+    do { // repeat the hashing pass until no bin overflowed
+        setupBinPointer(); // rewind every bin cursor to the start of its slice
+        hashElements(inputOutputArray, N); // copy the N input elements into their bins
+    } while(checkForOverflowAndResizeArray(false) == true); // overflow occurred: bins regrown (tmpElementBuffer left as is)
+    return mergeScoreDuplicates(inputOutputArray); // result is written back into the input array
 }
 
-template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N) {
-    newStart:
-    setupBinPointer(bins, BINCOUNT, binDataFrame, binSize);
-    hashElements(inputOutputArray, N, this->bins);
-    if(checkForOverflowAndResizeArray(bins, BINCOUNT, binSize) == true) // overflowed occurred
-        goto newStart;
-    return mergeDiagonalDuplicates(this->bins, this->BINCOUNT, inputOutputArray);
+template<unsigned int BINSIZE>
+size_t CacheFriendlyOperations<BINSIZE>::mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N) {
+    do { // repeat the hashing pass until no bin overflowed
+        setupBinPointer(); // rewind every bin cursor to the start of its slice
+        hashElements(inputOutputArray, N); // copy the N input elements into their bins
+    } while(checkForOverflowAndResizeArray(false) == true); // overflow occurred: bins regrown (tmpElementBuffer left as is)
+    return mergeDiagonalDuplicates(inputOutputArray); // result is written back into the input array
 }
 
-template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::keepMaxScoreElementOnly(CounterResult *inputOutputArray, const size_t N) {
-    newStart:
-    setupBinPointer(bins, BINCOUNT, binDataFrame, binSize);
-    hashElements(inputOutputArray, N, this->bins);
-    if(checkForOverflowAndResizeArray(bins, BINCOUNT, binSize) == true) // overflowed occurred
-        goto newStart;
-    return keepMaxElement(this->bins, this->BINCOUNT, inputOutputArray);
+template<unsigned int BINSIZE>
+size_t CacheFriendlyOperations<BINSIZE>::keepMaxScoreElementOnly(CounterResult *inputOutputArray, const size_t N) {
+    do { // repeat the hashing pass until no bin overflowed
+        setupBinPointer(); // rewind every bin cursor to the start of its slice
+        hashElements(inputOutputArray, N); // copy the N input elements into their bins
+    } while (checkForOverflowAndResizeArray(false) == true); // overflow occurred: bins regrown (tmpElementBuffer left as is)
+    return keepMaxElement(inputOutputArray); // result is written back into the input array
 }
 
 
-template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::mergeDiagonalDuplicates(CounterResult **bins, unsigned int binCount,
-                                                                                        CounterResult * output) {
+template<unsigned int BINSIZE>
+size_t CacheFriendlyOperations<BINSIZE>::mergeDiagonalDuplicates(CounterResult *output) {
     size_t doubleElementCount = 0;
     const CounterResult *bin_ref_pointer = binDataFrame;
+    // duplicateBitArray is already zero'd from findDuplicates
 
-    for (size_t bin = 0; bin < binCount; bin++) {
+    for (size_t bin = 0; bin < BINCOUNT; bin++) {
         const CounterResult *binStartPos = (bin_ref_pointer + bin * binSize);
         const size_t currBinSize = (bins[bin] - binStartPos);
         size_t n = currBinSize - 1;
         // write diagonals + 1 in reverse order in the byte array
-        while ( n != static_cast<size_t>(-1) )
-        {
+        while (n != static_cast<size_t>(-1)) {
             const unsigned int element = binStartPos[n].id >> (MASK_0_5_BIT);
-            duplicateBitArray[element] = static_cast<unsigned char>(tmpElementBuffer[n].diagonal) + 1;
+            duplicateBitArray[element] = static_cast<unsigned char>(binStartPos[n].diagonal) + 1;
             --n;
         }
         // combine diagonals
         for (size_t n = 0; n < currBinSize; n++) {
-            const CounterResult element = binStartPos[n];
+            const CounterResult &element = binStartPos[n];
             const unsigned int hashBinElement = element.id >> (MASK_0_5_BIT);
-            output[doubleElementCount].id    = element.id;
+            output[doubleElementCount].id = element.id;
             output[doubleElementCount].count = element.count;
             output[doubleElementCount].diagonal = element.diagonal;
 //            std::cout << output[doubleElementCount].id << " " << (int)output[doubleElementCount].count << " " << (int)static_cast<unsigned char>(output[doubleElementCount].diagonal) << std::endl;
             // memory overflow can not happen since input array = output array
-            doubleElementCount += (duplicateBitArray[hashBinElement] != static_cast<unsigned char>(tmpElementBuffer[n].diagonal)) ? 1 : 0;
+            doubleElementCount += (duplicateBitArray[hashBinElement] != static_cast<unsigned char>(binStartPos[n].diagonal)) ? 1 : 0;
 
             duplicateBitArray[hashBinElement] = static_cast<unsigned char>(element.diagonal);
         }
@@ -109,18 +109,18 @@ template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::mergeDia
     return doubleElementCount;
 }
 
-template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::mergeDuplicates(CounterResult **bins, unsigned int binCount,
-                                                                                CounterResult * output) {
+template<unsigned int BINSIZE>
+size_t CacheFriendlyOperations<BINSIZE>::mergeScoreDuplicates(CounterResult *output) {
     size_t doubleElementCount = 0;
     const CounterResult *bin_ref_pointer = binDataFrame;
-    memset(duplicateBitArray, 0, duplicateBitArraySize * sizeof(unsigned char));
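+    // NOTE(review): assumes duplicateBitArray is all zero on entry (memset dropped); the loop below stores diagonals into it and findDuplicates can return early before clearing - confirm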
 
-    for (size_t bin = 0; bin < binCount; bin++) {
+    for (size_t bin = 0; bin < BINCOUNT; bin++) {
         const CounterResult *binStartPos = (bin_ref_pointer + bin * binSize);
         const size_t currBinSize = (bins[bin] - binStartPos);
         // merge double hits
         for (size_t n = 0; n < currBinSize; n++) {
-            const CounterResult element = binStartPos[n];
+            const CounterResult &element = binStartPos[n];
             const unsigned int hashBinElement = element.id >> (MASK_0_5_BIT);
             const unsigned char currScore = element.count;
             const unsigned char dbScore = duplicateBitArray[hashBinElement];
@@ -131,25 +131,23 @@ template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::mergeDup
         for (size_t n = 0; n < currBinSize; n++) {
             const CounterResult element = binStartPos[n];
             const unsigned int hashBinElement = element.id >> (MASK_0_5_BIT);
-            output[doubleElementCount].id    = element.id;
+            output[doubleElementCount].id = element.id;
             output[doubleElementCount].count = duplicateBitArray[hashBinElement];
             output[doubleElementCount].diagonal = element.diagonal;
             // memory overflow can not happen since input array = output array
             doubleElementCount += (UNLIKELY(duplicateBitArray[hashBinElement] != 0  ) ) ? 1 : 0;
-            duplicateBitArray[hashBinElement] = static_cast<unsigned char>(tmpElementBuffer[n].diagonal);
+            duplicateBitArray[hashBinElement] = static_cast<unsigned char>(binStartPos[n].diagonal);
         }
     }
     return doubleElementCount;
 }
 
-template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::findDuplicates(CounterResult **bins,
-                                                                               unsigned int binCount,
-                                                                               CounterResult * output,
-                                                                               size_t outputSize,
-                                                                               bool computeTotalScore) {
+template<unsigned int BINSIZE>
+size_t CacheFriendlyOperations<BINSIZE>::findDuplicates(CounterResult *output, size_t outputSize, bool computeTotalScore) {
+    memset(duplicateBitArray, 0, duplicateBitArraySize * sizeof(unsigned char));
     size_t doubleElementCount = 0;
-    const CounterResult * bin_ref_pointer = binDataFrame;
-    for (size_t bin = 0; bin < binCount; bin++) {
+    const CounterResult *bin_ref_pointer = binDataFrame;
+    for (size_t bin = 0; bin < BINCOUNT; bin++) {
         const CounterResult *binStartPos = (bin_ref_pointer + bin * binSize);
         const size_t currBinSize = (bins[bin] - binStartPos);
         size_t elementCount = 0;
@@ -157,8 +155,8 @@ template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::findDupl
         for (size_t n = 0; n < currBinSize; n++) {
             const CounterResult element = binStartPos[n];
             const unsigned int hashBinElement = element.id >> (MASK_0_5_BIT);
-            //const unsigned int byteArrayPos = hashBinElement >> 3; // equal to  hashBinElement / 8
-            //const unsigned char bitPosMask = 1 << (hashBinElement & 7);  // 7 = 00000111
+            //const unsigned int byteArrayPos = hashBinElement >> 3; // equal to hashBinElement / 8
+            //const unsigned char bitPosMask = 1 << (hashBinElement & 7); // 7 = 00000111
             // check if duplicate element was found before
             const unsigned char currDiagonal = element.diagonal;
             //currDiagonal = (currDiagonal == 0) ? 200 : currDiagonal;
@@ -170,11 +168,11 @@ template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::findDupl
             duplicateBitArray[hashBinElement] = currDiagonal;
         }
         // check for overflow
-        if(doubleElementCount + elementCount >= outputSize){
+        if (doubleElementCount + elementCount >= outputSize) {
             return doubleElementCount;
         }
-//        // set memory to zero
-        if(computeTotalScore){
+        // set memory to zero
+        if (computeTotalScore) {
             for (size_t n = 0; n < elementCount; n++) {
                 const unsigned int element = tmpElementBuffer[n].id >> (MASK_0_5_BIT);
                 duplicateBitArray[element] = 0;
@@ -196,12 +194,11 @@ template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::findDupl
                 doubleElementCount += (duplicateBitArray[hashBinElement] != 0) ? 1 : 0;
                 duplicateBitArray[hashBinElement] = 0;
             }
-        }else{
+        } else {
             // set duplicate bit array to first diagonal + 1
             // so (duplicateBitArray[hashBinElement] != tmpElementBuffer[n].diagonal) is true
             size_t n = elementCount - 1;
-            while ( n != static_cast<size_t>(-1) )
-            {
+            while (n != static_cast<size_t>(-1)) {
                 const unsigned int element = tmpElementBuffer[n].id >> (MASK_0_5_BIT);
                 duplicateBitArray[element] = static_cast<unsigned char>(tmpElementBuffer[n].diagonal) + 1;
                 --n;
@@ -212,130 +209,122 @@ template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::findDupl
                 const unsigned int element = tmpElementBuffer[n].id;
                 const unsigned int hashBinElement = element >> (MASK_0_5_BIT);
                 output[doubleElementCount].id    = element;
-                output[doubleElementCount].count = tmpElementBuffer[n].score;
+                output[doubleElementCount].count = 0;
                 output[doubleElementCount].diagonal = tmpElementBuffer[n].diagonal;
     //            const unsigned char diagonal = static_cast<unsigned char>(tmpElementBuffer[n].diagonal);
                 // memory overflow can not happen since input array = output array
     //            if(duplicateBitArray[hashBinElement] != tmpElementBuffer[n].diagonal){
     //                std::cout << "seq="<< output[doubleElementCount].id << "\tDiag=" << (int) output[doubleElementCount].diagonal
-    //                <<  " dup.Array=" << (int)duplicateBitArray[hashBinElement] << " tmp.Arr="<< (int)tmpElementBuffer[n].diagonal << std::endl;
+    //                << " dup.Array=" << (int)duplicateBitArray[hashBinElement] << " tmp.Arr="<< (int)tmpElementBuffer[n].diagonal << std::endl;
     //            }
                 doubleElementCount += (duplicateBitArray[hashBinElement] != static_cast<unsigned char>(tmpElementBuffer[n].diagonal)) ? 1 : 0;
-
                 duplicateBitArray[hashBinElement] = static_cast<unsigned char>(tmpElementBuffer[n].diagonal);
             }
         }
         // clean memory faster if current bin size is smaller duplicateBitArraySize
-        if(currBinSize < duplicateBitArraySize/16){
+        if (currBinSize < duplicateBitArraySize/16) {
             for (size_t n = 0; n < currBinSize; n++) {
                 const unsigned int byteArrayPos = binStartPos[n].id >> (MASK_0_5_BIT);
                 duplicateBitArray[byteArrayPos] = 0;
             }
-        }else{
+        } else {
             memset(duplicateBitArray, 0, duplicateBitArraySize * sizeof(unsigned char));
         }
     }
     return doubleElementCount;
 }
 
-template<unsigned int BINSIZE> bool CacheFriendlyOperations<BINSIZE>::checkForOverflowAndResizeArray(CounterResult **bins,
-                                                                                             const unsigned int binCount,
-                                                                                             const size_t binSize) {
-    const CounterResult * bin_ref_pointer = binDataFrame;
-    CounterResult * lastPosition = (binDataFrame + binCount * binSize) - 1;
-    for (size_t bin = 0; bin < binCount; bin++) {
+template<unsigned int BINSIZE>
+bool CacheFriendlyOperations<BINSIZE>::checkForOverflowAndResizeArray(bool includeTmpResult) { // true = overflow found, buffers regrown, caller must re-hash
+    const CounterResult *bin_ref_pointer = binDataFrame;
+    CounterResult *lastPosition = (binDataFrame + BINCOUNT * binSize) - 1; // final slot of binDataFrame
+    for (size_t bin = 0; bin < BINCOUNT; bin++) {
         const CounterResult *binStartPos = (bin_ref_pointer + bin * binSize);
         const size_t n = (bins[bin] - binStartPos);
         // if one bin has more elements than BIN_SIZE
         // or the current bin pointer is at the end of the binDataFrame
-        // reallocate new memory
-        if( n > binSize || bins[bin] >= lastPosition) {
+        // reallocate memory (the old contents are discarded, so the caller has to hash its input again)
+        if (n > binSize || bins[bin] >= lastPosition) {
             // overflow detected
             // find nearest upper power of 2^(x)
 //            std::cout << "Found overlow " << n << std::endl;
-            this->binSize = pow(2, ceil(log(binSize + 1)/log(2)));
-            reallocBinMemory(binCount, this->binSize);
+            binSize = pow(2, ceil(log(binSize + 1)/log(2))); // next power of two above the old binSize (floating-point computation)
+            // new(std::nothrow) yields nullptr on failure, so the pointer must be checked BEFORE memset dereferences it
+            delete[] binDataFrame;
+            binDataFrame = new(std::nothrow) CounterResult[BINCOUNT * binSize];
+            Util::checkAllocation(binDataFrame, "Cannot reallocate reallocBinMemory in CacheFriendlyOperations");
+            memset(binDataFrame, 0, sizeof(CounterResult) * binSize * BINCOUNT);
+            // NOTE(review): when includeTmpResult is false, tmpElementBuffer keeps its old length while binSize grows - confirm no later findDuplicates on this object indexes past it
+            if (includeTmpResult) {
+                delete[] tmpElementBuffer;
+                tmpElementBuffer = new(std::nothrow) TmpResult[binSize];
+                Util::checkAllocation(tmpElementBuffer, "Cannot reallocate tmpElementBuffer in CacheFriendlyOperations");
+                memset(tmpElementBuffer, 0, sizeof(TmpResult) * binSize);
+            }
             return true;
         }
     }
     return false;
 }
 
-template<unsigned int BINSIZE> void CacheFriendlyOperations<BINSIZE>::reallocBinMemory(const unsigned int binCount, const size_t binSize) {
-    delete [] binDataFrame;
-    delete [] tmpElementBuffer;
-    binDataFrame     = new(std::nothrow) CounterResult[binCount * binSize];
-    memset(binDataFrame, 0, sizeof(CounterResult) * binSize * binCount);
-    Util::checkAllocation(binDataFrame, "Can not allocate reallocBinMemory memory in CacheFriendlyOperations::reallocBinMemory");
-    tmpElementBuffer = new(std::nothrow) TmpResult[binSize];
-    memset(tmpElementBuffer, 0, sizeof(TmpResult) * binSize);
-    Util::checkAllocation(tmpElementBuffer, "Can not allocate tmpElementBuffer memory in CacheFriendlyOperations::reallocBinMemory");
-}
-
-template<unsigned int BINSIZE> void CacheFriendlyOperations<BINSIZE>::setupBinPointer(CounterResult **bins, const unsigned int binCount,
-                                                                              CounterResult *binDataFrame, const size_t binSize)
-{
-    // Example binCount = 3
+template<unsigned int BINSIZE>
+void CacheFriendlyOperations<BINSIZE>::setupBinPointer() { // points every bins[] cursor at the first slot of its bin
+    // Example BINCOUNT = 3
     // bin start             |-----------------------|-----------------------| bin end
     //    segments[bin_step][0]
     //                            segments[bin_step][1]
     //                                                    segments[bin_step][2]
-    size_t curr_pos = 0;
-    for(size_t bin = 0; bin < binCount; bin++){
-        bins[bin] = binDataFrame + curr_pos;
-        curr_pos += binSize;
+    for (size_t bin = 0; bin < BINCOUNT; bin++) {
+        bins[bin] = binDataFrame + bin * binSize; // bin k owns slots [k * binSize, (k + 1) * binSize) of binDataFrame
     }
 }
 
-template<unsigned int BINSIZE> void CacheFriendlyOperations<BINSIZE>::hashElements(CounterResult *inputArray, size_t N, CounterResult **hashBins)
-{
-    CounterResult * lastPosition = (binDataFrame + BINCOUNT * binSize) - 1;
-    for(size_t n = 0; n < N; n++) {
-        const CounterResult element = inputArray[n];
-        const unsigned int bin_id  = (element.id & MASK_0_5);
-        hashBins[bin_id]->id       = element.id;
-        hashBins[bin_id]->diagonal = element.diagonal;
-        hashBins[bin_id]->count    = element.count;
+template<unsigned int BINSIZE>
+void CacheFriendlyOperations<BINSIZE>::hashElements(CounterResult *inputArray, size_t N) { // copies N elements into the bins
+    CounterResult *lastPosition = (binDataFrame + BINCOUNT * binSize) - 1; // final slot of binDataFrame
+    for (size_t n = 0; n < N; n++) {
+        const CounterResult &element = inputArray[n];
+        const unsigned int bin = (element.id & MASK_0_5); // low id bits select the bin
+        bins[bin]->id       = element.id;
+        bins[bin]->diagonal = element.diagonal;
+        bins[bin]->count    = element.count;
 
         // do not write over boundary of the data frame
-        hashBins[bin_id] += (hashBins[bin_id] >= lastPosition) ? 0 : 1;
+        bins[bin] += (bins[bin] >= lastPosition) ? 0 : 1; // cursor sticks at the final slot; checkForOverflowAndResizeArray tests for that
     }
 }
 
-template<unsigned int BINSIZE> void CacheFriendlyOperations<BINSIZE>::hashIndexEntry(unsigned short position_i, IndexEntryLocal *inputArray,
-                                                                             size_t N, CounterResult **hashBins, CounterResult * lastPosition)
-{
-    for(size_t n = 0; n < N; n++) {
-        const IndexEntryLocal element = inputArray[n];
-        const unsigned int bin_id = (element.seqId & MASK_0_5);
-        hashBins[bin_id]->id    = element.seqId;
-        hashBins[bin_id]->diagonal = position_i - element.position_j;
+template<unsigned int BINSIZE>
+void CacheFriendlyOperations<BINSIZE>::hashIndexEntry(unsigned short position_i, IndexEntryLocal *inputArray, size_t N,  CounterResult *lastPosition) {
+    for (size_t n = 0; n < N; n++) { // one bin slot per index entry; the slot's count field is left untouched here
+        const IndexEntryLocal &element = inputArray[n];
+        const unsigned int bin = (element.seqId & MASK_0_5); // low seqId bits select the bin
+        bins[bin]->id = element.seqId;
+        bins[bin]->diagonal = position_i - element.position_j; // stored in an unsigned short, so the difference wraps modulo 2^16
         // do not write over boundary of the data frame
-//        std::cout << hashBins[bin_id]->id  << " " << position_i << " "
-//        << element.position_j << " " << hashBins[bin_id]->diagonal << " " << position_i - element.position_j   << std::endl;
-        hashBins[bin_id] += (hashBins[bin_id] >= lastPosition) ? 0 : 1;
+        //std::cout << bins[bin]->id << " " << position_i << " " << element.position_j << " " << bins[bin]->diagonal << " " << position_i - element.position_j << std::endl;
+        bins[bin] += (bins[bin] >= lastPosition) ? 0 : 1; // cursor sticks at lastPosition instead of leaving the data frame
     }
 }
 
-template<unsigned int BINSIZE> size_t CacheFriendlyOperations<BINSIZE>::keepMaxElement(CounterResult **bins,
-                                                                               unsigned int binCount,
-                                                                               CounterResult * output) {
+template<unsigned int BINSIZE>
+size_t CacheFriendlyOperations<BINSIZE>::keepMaxElement(CounterResult *output) {
     size_t doubleElementCount = 0;
     const CounterResult *bin_ref_pointer = binDataFrame;
     memset(duplicateBitArray, 0, duplicateBitArraySize * sizeof(unsigned char));
-    for (size_t bin = 0; bin < binCount; bin++) {
+    for (size_t bin = 0; bin < BINCOUNT; bin++) {
         const CounterResult *binStartPos = (bin_ref_pointer + bin * binSize);
         const size_t currBinSize = (bins[bin] - binStartPos);
         // found max element and store it in duplicateBitArray
         for (size_t n = 0; n < currBinSize; n++) {
-            const CounterResult element = binStartPos[n];
+            const CounterResult &element = binStartPos[n];
             const unsigned int hashBinElement = element.id >> (MASK_0_5_BIT);
             const unsigned char currScore = element.count;
             const unsigned char dbScore = duplicateBitArray[hashBinElement];
             const unsigned char maxScore = (currScore > dbScore) ? currScore : dbScore;
             duplicateBitArray[hashBinElement] = maxScore;
         }
-        // extract final scores and set dubplicateBitArray to 0
+        // extract final scores and set duplicateBitArray to 0
         for (size_t n = 0; n < currBinSize; n++) {
             const CounterResult element = binStartPos[n];
             const unsigned int hashBinElement = element.id >> (MASK_0_5_BIT);
@@ -361,4 +350,4 @@ template class CacheFriendlyOperations<32>;
 template class CacheFriendlyOperations<16>;
 template class CacheFriendlyOperations<8>;
 template class CacheFriendlyOperations<4>;
-template class CacheFriendlyOperations<2>;
\ No newline at end of file
+template class CacheFriendlyOperations<2>;
diff --git a/src/prefiltering/CacheFriendlyOperations.h b/src/prefiltering/CacheFriendlyOperations.h
index 28af0f1..38d9f2a 100644
--- a/src/prefiltering/CacheFriendlyOperations.h
+++ b/src/prefiltering/CacheFriendlyOperations.h
@@ -1,12 +1,9 @@
 #ifndef COUNTIN32ARRAY_H
 #define COUNTIN32ARRAY_H
 
-#include <cstdlib>
-#include <cmath>
-#include <cstring>
 #include "IndexTable.h"
 
-#define IS_REPRESENTIBLE_IN_D_BITS(D, N)                \
+#define IS_REPRESENTIBLE_IN_D_BITS(D, N) \
   (((unsigned long) N >= (1UL << (D - 1)) && (unsigned long) N < (1UL << D)) ? D : -1)
 
 #define BITS_TO_REPRESENT(N)                            \
@@ -46,83 +43,72 @@
                  )                                      \
    )
 
-struct  __attribute__((__packed__))  CounterResult {
-    unsigned int  id;
+struct __attribute__((__packed__)) CounterResult { // packed attribute: members are laid out without padding
+    unsigned int id; // CacheFriendlyOperations picks the bin from (id & MASK_0_5) and indexes duplicateBitArray with (id >> MASK_0_5_BIT)
     unsigned short diagonal;
     unsigned char count;
 };
 
-template<unsigned int BINSIZE> class CacheFriendlyOperations{
+template<unsigned int BINSIZE>
+class CacheFriendlyOperations { // distributes entries over BINSIZE bins chosen by the low bits of their id
 public:
     // 00000000000000000000000111111111
     static const unsigned int MASK_0_5 = BINSIZE - 1;
     static const unsigned int MASK_0_5_BIT = BITS_TO_REPRESENT(MASK_0_5);
 
-    CacheFriendlyOperations(size_t maxElement,
-                    size_t initBinSize);
-
+    CacheFriendlyOperations(size_t maxElement, size_t initBinSize);
     ~CacheFriendlyOperations();
 
-    size_t countElements(IndexEntryLocal **input, CounterResult *output,
-                         size_t outputSize, unsigned short indexFrom, unsigned short indexTo, bool computeTotalScore);
-    // merge elements in CounterResult
-    // assumption is that each element (diagonalMatcher.id) exists maximal two times
+    size_t findDuplicates(IndexEntryLocal **input, CounterResult *output, size_t outputSize, unsigned short indexFrom, unsigned short indexTo, bool computeTotalScore); // hashes the index entries of positions [indexFrom, indexTo), then runs the private findDuplicates on output
+
+    // merge elements in CounterResult assuming that each element (diagonalMatcher.id) exists at most twice
     size_t mergeElementsByScore(CounterResult *inputOutputArray, const size_t N);
 
-    // merge elements in CounterResult by diagonal
-    // it combines elements with same ids that occurs after each other
+    // merge elements in CounterResult by diagonal, combines elements with same ids that occur after each other
     size_t mergeElementsByDiagonal(CounterResult *inputOutputArray, const size_t N);
+
     size_t keepMaxScoreElementOnly(CounterResult *inputOutputArray, const size_t N);
+
 private:
     // this bit array should fit in L1/L2
     size_t duplicateBitArraySize;
-    unsigned char * duplicateBitArray;
+    unsigned char *duplicateBitArray;
     // needed for lower bit hashing function
     const static unsigned int BINCOUNT = MASK_0_5 + 1;
     size_t binSize;
     // pointer for hashing
-    CounterResult ** bins;
+    CounterResult **bins;
     // array to keep the bin elements
-    CounterResult * binDataFrame;
+    CounterResult *binDataFrame;
 
-
-    struct  __attribute__((__packed__))  TmpResult {
-        unsigned int  id;
+    struct __attribute__((__packed__)) TmpResult {
+        unsigned int id;
         unsigned short diagonal;
-        unsigned char score;
     };
     // needed to temporary keep ids
     TmpResult *tmpElementBuffer;
-    // detect if overflow occurs
-    bool checkForOverflowAndResizeArray(CounterResult **bins,
-                                        const unsigned int binCount,
-                                        const size_t binSize);
 
-    // extend the bin size
-    void reallocBinMemory(unsigned int const binCount, size_t const binSize);
+    // detect bin overflow; on overflow grow binSize, reallocate binDataFrame (and tmpElementBuffer if includeTmpResult) and return true
+    bool checkForOverflowAndResizeArray(bool includeTmpResult);
 
     // reset pointer to the bin start pos
-    void setupBinPointer(CounterResult **bins, const unsigned int binCount,
-                         CounterResult *binDataFrame, const size_t binSize);
+    void setupBinPointer();
 
     // hash input array based on MASK_0_5
-    void hashElements(CounterResult *inputArray, size_t N, CounterResult **hashBins);
+    void hashElements(CounterResult *inputArray, size_t N);
 
     // hash index entry and compute diagonal
-    void hashIndexEntry(unsigned short position_i, IndexEntryLocal *inputArray,
-                        size_t N, CounterResult **hashBins, CounterResult * lastPosition);
+    void hashIndexEntry(unsigned short position_i, IndexEntryLocal *inputArray, size_t N, CounterResult *lastPosition);
 
     // detect duplicates in diagonal
-    size_t findDuplicates(CounterResult **bins, unsigned int binCount,
-                          CounterResult * output, size_t outputSize, bool findDuplicates);
+    size_t findDuplicates(CounterResult *output, size_t outputSize, bool computeTotalScore);
 
     // merge by id and combine score
-    size_t mergeDuplicates(CounterResult **bins, unsigned int binCount, CounterResult *output);
+    size_t mergeScoreDuplicates(CounterResult *output);
 
-    //
-    size_t mergeDiagonalDuplicates(CounterResult **bins, unsigned int binCount, CounterResult *output);
+    size_t mergeDiagonalDuplicates(CounterResult *output); // second stage of mergeElementsByDiagonal, run after hashing
 
-    size_t keepMaxElement(CounterResult **pResult, const unsigned int bincount, CounterResult *pCounterResult);
+    size_t keepMaxElement(CounterResult *output); // second stage of keepMaxScoreElementOnly, run after hashing
 };
 
 #undef BITS_TO_REPRESENT
diff --git a/src/prefiltering/ExtendedSubstitutionMatrix.cpp b/src/prefiltering/ExtendedSubstitutionMatrix.cpp
index 9a3b5e9..7b54296 100644
--- a/src/prefiltering/ExtendedSubstitutionMatrix.cpp
+++ b/src/prefiltering/ExtendedSubstitutionMatrix.cpp
@@ -3,8 +3,6 @@
 #include "Util.h"
 #include "simd.h"
 
-
-#include <iostream>
 #include <iterator>
 #include <cmath>
 #include <cstdlib>
@@ -26,7 +24,7 @@ ScoreMatrix ExtendedSubstitutionMatrix::calcScoreMatrix(const BaseMatrix& matrix
     size_t row_size = size / MAX_ALIGN_INT;
     row_size = (row_size + 1) * MAX_ALIGN_INT; // for SIMD memory alignment
     // create permutation
-    std::vector<std::vector<int> > input(buildInput(kmerSize,alphabetSize));
+    std::vector<std::vector<unsigned char> > input(buildInput(kmerSize,alphabetSize));
     
     // score matrix is O(size^2). 64 is added for SSE
     short * score = (short *) mem_align(MAX_ALIGN_INT, (size * (row_size)) * sizeof(short));
@@ -34,8 +32,8 @@ ScoreMatrix ExtendedSubstitutionMatrix::calcScoreMatrix(const BaseMatrix& matrix
     unsigned int * index = (unsigned int *)mem_align(MAX_ALIGN_INT, (size * (row_size)) * sizeof(unsigned int));
 
 
-    std::vector<std::vector<int> > permutation;
-    std::vector<int> outputTemp;
+    std::vector<std::vector<unsigned char> > permutation;
+    std::vector<unsigned char> outputTemp;
     createCartesianProduct(permutation, outputTemp, input.begin(), input.end());
 #pragma omp parallel
 {
@@ -75,7 +73,7 @@ void ExtendedSubstitutionMatrix::freeScoreMatrix(ScoreMatrix& matrix) {
     free(matrix.index);
 }
 
-short ExtendedSubstitutionMatrix::calcScore(int * i_seq,int * j_seq,size_t seq_size, short **subMatrix){
+short ExtendedSubstitutionMatrix::calcScore(unsigned char * i_seq, unsigned char * j_seq,size_t seq_size, short **subMatrix){
     short score = 0;
     for(size_t i = 0; i < seq_size; i++){
         score += subMatrix[i_seq[i]][j_seq[i]];
@@ -84,11 +82,11 @@ short ExtendedSubstitutionMatrix::calcScore(int * i_seq,int * j_seq,size_t seq_s
 }
 
 // Creates the input
-std::vector<std::vector<int> > ExtendedSubstitutionMatrix::buildInput(size_t dimension,size_t range) {
-    std::vector<std::vector<int> >  dimension_vector;
+std::vector<std::vector<unsigned char> > ExtendedSubstitutionMatrix::buildInput(size_t dimension,size_t range) {
+    std::vector<std::vector<unsigned char> >  dimension_vector;
     
     for(size_t i = 0; i < dimension; i++) {
-        std::vector<int> range_vector;
+        std::vector<unsigned char> range_vector;
         for(size_t j = 0; j < range; j++) {
             range_vector.push_back(j);
         }
@@ -105,10 +103,10 @@ std::vector<std::vector<int> > ExtendedSubstitutionMatrix::buildInput(size_t dim
 //      recurse on next "me"
 //
 void ExtendedSubstitutionMatrix::createCartesianProduct(
-                                                        std::vector<std::vector<int> > & output,  // final result
-                                                        std::vector<int>&  current_result,   // current result
-                                                        std::vector<std::vector<int> >::const_iterator current_input, // current input
-                                                        std::vector<std::vector<int> >::const_iterator end) // final input
+                                                        std::vector<std::vector<unsigned char> > & output,  // final result
+                                                        std::vector<unsigned char>&  current_result,   // current result
+                                                        std::vector<std::vector<unsigned char> >::const_iterator current_input, // current input
+                                                        std::vector<std::vector<unsigned char> >::const_iterator end) // final input
 {
     if(current_input == end) {
         // terminal condition of the recursion. We no longer have
@@ -119,8 +117,8 @@ void ExtendedSubstitutionMatrix::createCartesianProduct(
     }
     
     // need an easy name for my vector-of-ints
-    const std::vector<int>& mevi = *current_input;
-    for(std::vector<int>::const_iterator it = mevi.begin();it != mevi.end();it++) {
+    const std::vector<unsigned char>& mevi = *current_input;
+    for(std::vector<unsigned char>::const_iterator it = mevi.begin();it != mevi.end();it++) {
         current_result.push_back(*it);  // add ME
         createCartesianProduct(output, current_result, current_input+1, end);
         current_result.pop_back(); // clean current result off for next round
diff --git a/src/prefiltering/ExtendedSubstitutionMatrix.h b/src/prefiltering/ExtendedSubstitutionMatrix.h
index cb11d4b..dfeb9dc 100644
--- a/src/prefiltering/ExtendedSubstitutionMatrix.h
+++ b/src/prefiltering/ExtendedSubstitutionMatrix.h
@@ -13,15 +13,16 @@ class ExtendedSubstitutionMatrix
     static ScoreMatrix calcScoreMatrix(const BaseMatrix& matrix, const size_t kmerSize);
     static void freeScoreMatrix(ScoreMatrix& matrix);
 
-    static short calcScore(int * i_seq,int * j_seq,size_t seq_size,short **subMatrix);
+    static short calcScore(unsigned char * i_seq, unsigned char * j_seq,size_t seq_size,short **subMatrix);
 
 private:
-    static std::vector<std::vector<int> > buildInput(size_t dimension,size_t range);
+    static std::vector<std::vector<unsigned char> > buildInput(size_t dimension,size_t range);
+
     static void createCartesianProduct(
-                                std::vector<std::vector<int> > & output,  // final result
-                                std::vector<int>&  current_result,   // current result
-                                std::vector<std::vector<int> >::const_iterator current_input, // current input
-                                std::vector<std::vector<int> >::const_iterator end); // final input
+                                std::vector<std::vector<unsigned char> > & output,  // final result
+                                std::vector<unsigned char>&  current_result,   // current result
+                                std::vector<std::vector<unsigned char> >::const_iterator current_input, // current input
+                                std::vector<std::vector<unsigned char> >::const_iterator end); // final input
 
 };
 #endif
diff --git a/src/prefiltering/IndexBuilder.cpp b/src/prefiltering/IndexBuilder.cpp
index 111d68b..b2e1d04 100644
--- a/src/prefiltering/IndexBuilder.cpp
+++ b/src/prefiltering/IndexBuilder.cpp
@@ -109,9 +109,8 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
             generator->setDivideStrategy(s.profile_matrix);
         }
 
-        unsigned int *buffer = new unsigned int[seq->getMaxLen()];
-        char *charSequence = new char[seq->getMaxLen()];
-
+        unsigned int *buffer = static_cast<unsigned int*>(malloc(seq->getMaxLen() * sizeof(unsigned int)));
+        unsigned int bufferSize = seq->getMaxLen();
         #pragma omp for schedule(dynamic, 100) reduction(+:totalKmerCount, maskedResidues)
         for (size_t id = dbFrom; id < dbTo; id++) {
             progress.updateProgress();
@@ -121,24 +120,24 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
             unsigned int qKey = dbr->getDbKey(id);
 
             s.mapSequence(id - dbFrom, qKey, seqData, dbr->getSeqLen(id));
-
+            if(s.getMaxLen() > bufferSize ){ // grow only when the Sequence object has outgrown the scratch buffer
+                buffer = static_cast<unsigned int*>(realloc(buffer, s.getMaxLen() * sizeof(unsigned int)));
+                bufferSize = s.getMaxLen(); // record the size actually allocated (s, not the template seq)
+            }
             // count similar or exact k-mers based on sequence type
             if (isProfile) {
                 // Find out if we should also mask profiles
                 totalKmerCount += indexTable->addSimilarKmerCount(&s, generator);
-                (*unmaskedLookup)->addSequence(s.int_consensus_sequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
+                (*unmaskedLookup)->addSequence(s.numConsensusSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
             } else {
                 // Do not mask if column state sequences are used
                 if (unmaskedLookup != NULL) {
-                    (*unmaskedLookup)->addSequence(s.int_sequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
+                    (*unmaskedLookup)->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
                 }
                 if (mask == true) {
-                    for (int i = 0; i < s.L; ++i) {
-                        charSequence[i] = (char) s.int_sequence[i];
-                    }
                     // s.print();
-                    maskedResidues += tantan::maskSequences(charSequence,
-                                                            charSequence + s.L,
+                    maskedResidues += tantan::maskSequences((char*)s.numSequence,
+                                                            (char*)(s.numSequence + s.L),
                                                             50 /*options.maxCycleLength*/,
                                                             probMatrix->probMatrixPointers,
                                                             0.005 /*options.repeatProb*/,
@@ -147,32 +146,27 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
                                                             0, 0,
                                                             0.9 /*options.minMaskProb*/,
                                                             probMatrix->hardMaskTable);
-
-                    for (int i = 0; i < s.L; i++) {
-                        s.int_sequence[i] = charSequence[i];
-                    }
                 }
 
                 if(maskLowerCaseMode == true && (Parameters::isEqualDbtype(s.getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS) ||
                                                   Parameters::isEqualDbtype(s.getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES))) {
                     const char * charSeq = s.getSeqData();
-                    int maskLetter = subMat.aa2int[(int)'X'];
+                    unsigned char maskLetter = subMat.aa2num[static_cast<int>('X')];
                     for (int i = 0; i < s.L; i++) {
                         bool isLowerCase = (islower(charSeq[i]));
                         maskedResidues += isLowerCase;
-                        s.int_sequence[i] = isLowerCase ? maskLetter : s.int_sequence[i];
+                        s.numSequence[i] = isLowerCase ? maskLetter : s.numSequence[i];
                     }
                 }
                 if(maskedLookup != NULL){
-                    (*maskedLookup)->addSequence(s.int_sequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
+                    (*maskedLookup)->addSequence(s.numSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
                 }
 
                 totalKmerCount += indexTable->addKmerCount(&s, &idxer, buffer, kmerThr, idScoreLookup);
             }
         }
 
-        delete[] charSequence;
-        delete[] buffer;
+        free(buffer);
 
         if (generator != NULL) {
             delete generator;
@@ -225,8 +219,8 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
 #endif
         Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getSpacedKmerPattern());
         Indexer idxer(static_cast<unsigned int>(indexTable->getAlphabetSize()), seq->getKmerSize());
-        IndexEntryLocalTmp *buffer = new IndexEntryLocalTmp[seq->getMaxLen()];
-
+        IndexEntryLocalTmp *buffer = static_cast<IndexEntryLocalTmp *>(malloc( seq->getMaxLen() * sizeof(IndexEntryLocalTmp)));
+        size_t bufferSize = seq->getMaxLen();
         KmerGenerator *generator = NULL;
         if (isProfile) {
             generator = new KmerGenerator(seq->getKmerSize(), indexTable->getAlphabetSize(), kmerThr);
@@ -241,10 +235,10 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
             unsigned int qKey = dbr->getDbKey(id);
             if (isProfile) {
                 s.mapSequence(id - dbFrom, qKey, dbr->getData(id, thread_idx), dbr->getSeqLen(id));
-                indexTable->addSimilarSequence(&s, generator, &idxer);
+                indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer);
             } else {
                 s.mapSequence(id - dbFrom, qKey, sequenceLookup->getSequence(id - dbFrom));
-                indexTable->addSequence(&s, &idxer, buffer, kmerThr, idScoreLookup);
+                indexTable->addSequence(&s, &idxer, &buffer, bufferSize, kmerThr, idScoreLookup);
             }
         }
 
@@ -252,7 +246,7 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
             delete generator;
         }
 
-        delete [] buffer;
+        free(buffer);
     }
     if(idScoreLookup!=NULL){
         delete[] idScoreLookup;
diff --git a/src/prefiltering/IndexTable.h b/src/prefiltering/IndexTable.h
index f6970f1..ea8d0a4 100644
--- a/src/prefiltering/IndexTable.h
+++ b/src/prefiltering/IndexTable.h
@@ -16,7 +16,7 @@
 #include "MathUtil.h"
 #include "KmerGenerator.h"
 #include "Parameters.h"
-
+#include <stdlib.h>
 #include <algorithm>
 
 // IndexEntryLocal is an entry with position and seqId for a kmer
@@ -99,7 +99,7 @@ class IndexTable {
 
         //idxer->reset();
         while(s->hasNextKmer()){
-            const int * kmer = s->nextKmer();
+            const unsigned char * kmer = s->nextKmer();
             const std::pair<size_t *, size_t> kmerList = kmerGenerator->generateKmerList(kmer);
 
             //unsigned int kmerIdx = idxer->int2index(kmer, 0, kmerSize);
@@ -132,17 +132,10 @@ class IndexTable {
         size_t countKmer = 0;
         bool removeX = (Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES) ||
                         Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS));
-        const int xIndex = s->subMat->aa2int[(int)'X'];
         while(s->hasNextKmer()){
-            const int * kmer = s->nextKmer();
-            if(removeX){
-                int xCount = 0;
-                for(int pos = 0; pos < kmerSize; pos++){
-                    xCount += (kmer[pos] == xIndex);
-                }
-                if(xCount > 0){
-                    continue;
-                }
+            const unsigned char * kmer = s->nextKmer();
+            if(removeX && s->kmerContainsX()){
+                continue;
             }
             if(threshold > 0){
                 int score = 0;
@@ -258,7 +251,7 @@ class IndexTable {
         offsets[0] = 0;
     }
 
-    void printStatistics(char *int2aa) {
+    void printStatistics(char *num2aa) {
         const size_t top_N = 10;
         std::pair<size_t, size_t> topElements[top_N];
         for (size_t j = 0; j < top_N; j++) {
@@ -294,44 +287,50 @@ class IndexTable {
         Debug(Debug::INFO) << "Top " << top_N << " k-mers\n";
         for (size_t j = 0; j < top_N; j++) {
             Debug(Debug::INFO) << "    ";
-            indexer->printKmer(topElements[j].second, kmerSize, int2aa);
+            indexer->printKmer(topElements[j].second, kmerSize, num2aa);
             Debug(Debug::INFO) << "\t" << topElements[j].first << "\n";
         }
     }
 
     // FUNCTIONS TO OVERWRITE
     // add k-mers of the sequence to the index table
-    void addSimilarSequence(Sequence* s, KmerGenerator* kmerGenerator, Indexer * idxer) {
-        std::vector<IndexEntryLocalTmp> buffer;
+    void addSimilarSequence(Sequence* s, KmerGenerator* kmerGenerator, IndexEntryLocalTmp ** buffer, size_t &bufferSize, Indexer * idxer) {
         // iterate over all k-mers of the sequence and add the id of s to the sequence list of the k-mer (tableDummy)
         s->resetCurrPos();
         idxer->reset();
         size_t kmerPos = 0;
         while(s->hasNextKmer()){
-            const int * kmer = s->nextKmer();
+            const unsigned char * kmer = s->nextKmer();
             std::pair<size_t *, size_t> scoreMatrix = kmerGenerator->generateKmerList(kmer);
+            while(kmerPos+scoreMatrix.second >= bufferSize){ // one doubling may not fit the whole similar-k-mer list
+                bufferSize = (bufferSize == 0) ? 1 : bufferSize*2; // guard against a zero-sized start, which would never grow
+                *buffer = static_cast<IndexEntryLocalTmp*>(realloc(*buffer, sizeof(IndexEntryLocalTmp) * bufferSize));
+            }
             for(size_t i = 0; i < scoreMatrix.second; i++) {
                 unsigned int kmerIdx = scoreMatrix.first[i];
 
                 // if region got masked do not add kmer
                 if (offsets[kmerIdx + 1] - offsets[kmerIdx] == 0)
                     continue;
-                buffer.push_back(IndexEntryLocalTmp(kmerIdx,s->getId(), s->getCurrentPosition()));
+                (*buffer)[kmerPos].kmer = kmerIdx;
+                (*buffer)[kmerPos].seqId = s->getId();
+                (*buffer)[kmerPos].position_j = s->getCurrentPosition();
                 kmerPos++;
             }
+
         }
 
         if(kmerPos>1){
-            std::sort(buffer.begin(), buffer.end(), IndexEntryLocalTmp::comapreByIdAndPos);
+            std::sort(*buffer, *buffer+kmerPos, IndexEntryLocalTmp::comapreByIdAndPos);
         }
         unsigned int prevKmer = UINT_MAX;
-        for(size_t pos = 0; pos < buffer.size(); pos++){
-            unsigned int kmerIdx = buffer[pos].kmer;
+        for(size_t pos = 0; pos < kmerPos; pos++){
+            unsigned int kmerIdx = (*buffer)[pos].kmer;
             if(kmerIdx != prevKmer){
                 size_t offset = __sync_fetch_and_add(&(offsets[kmerIdx]), 1);
                 IndexEntryLocal *entry = &entries[offset];
-                entry->seqId      = buffer[pos].seqId;
-                entry->position_j = buffer[pos].position_j;
+                entry->seqId      = (*buffer)[pos].seqId;
+                entry->position_j = (*buffer)[pos].position_j;
             }
             prevKmer = kmerIdx;
         }
@@ -339,7 +338,7 @@ class IndexTable {
 
     // add k-mers of the sequence to the index table
     void addSequence (Sequence* s, Indexer * idxer,
-                      IndexEntryLocalTmp * buffer,
+                      IndexEntryLocalTmp ** buffer, size_t bufferSize,
                       int threshold, char * diagonalScore){
         // iterate over all k-mers of the sequence and add the id of s to the sequence list of the k-mer (tableDummy)
         s->resetCurrPos();
@@ -347,17 +346,10 @@ class IndexTable {
         size_t kmerPos = 0;
         bool removeX = (Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_NUCLEOTIDES) ||
                         Parameters::isEqualDbtype(s->getSequenceType(), Parameters::DBTYPE_AMINO_ACIDS));
-        const int xIndex = s->subMat->aa2int[(int)'X'];
         while (s->hasNextKmer()){
-            const int * kmer = s->nextKmer();
-            if(removeX){
-                int xCount = 0;
-                for(int pos = 0; pos < kmerSize; pos++){
-                    xCount += (kmer[pos] == xIndex);
-                }
-                if(xCount > 0){
-                    continue;
-                }
+            const unsigned char * kmer = s->nextKmer();
+            if(removeX && s->kmerContainsX()){
+                continue;
             }
             if(threshold > 0) {
                 int score = 0;
@@ -373,35 +365,39 @@ class IndexTable {
             if (offsets[kmerIdx + 1] - offsets[kmerIdx] == 0)
                 continue;
 
-            buffer[kmerPos].kmer = kmerIdx;
-            buffer[kmerPos].seqId      = s->getId();
-            buffer[kmerPos].position_j = s->getCurrentPosition();
+            (*buffer)[kmerPos].kmer = kmerIdx;
+            (*buffer)[kmerPos].seqId      = s->getId();
+            (*buffer)[kmerPos].position_j = s->getCurrentPosition();
             kmerPos++;
+            if(kmerPos >= bufferSize){
+                *buffer = static_cast<IndexEntryLocalTmp*>(realloc(*buffer, sizeof(IndexEntryLocalTmp) * bufferSize*2));
+                bufferSize = bufferSize*2;
+            }
         }
 
         if(kmerPos>1){
-            std::sort(buffer, buffer+kmerPos, IndexEntryLocalTmp::comapreByIdAndPos);
+            std::sort(*buffer, *buffer+kmerPos, IndexEntryLocalTmp::comapreByIdAndPos);
         }
 
         unsigned int prevKmer = UINT_MAX;
         for(size_t pos = 0; pos < kmerPos; pos++){
-            unsigned int kmerIdx = buffer[pos].kmer;
+            unsigned int kmerIdx = (*buffer)[pos].kmer;
             if(kmerIdx != prevKmer){
                 size_t offset = __sync_fetch_and_add(&(offsets[kmerIdx]), 1);
                 IndexEntryLocal *entry = &entries[offset];
-                entry->seqId      = buffer[pos].seqId;
-                entry->position_j = buffer[pos].position_j;
+                entry->seqId      = (*buffer)[pos].seqId;
+                entry->position_j = (*buffer)[pos].position_j;
             }
             prevKmer = kmerIdx;
         }
     }
 
     // prints the IndexTable
-    void print(char *int2aa) {
+    void print(char *num2aa) {
         for (size_t i = 0; i < tableSize; i++) {
             ptrdiff_t entrySize = offsets[i + 1] - offsets[i];
             if (entrySize > 0) {
-                indexer->printKmer(i, kmerSize, int2aa);
+                indexer->printKmer(i, kmerSize, num2aa);
 
                 Debug(Debug::INFO) << "\n";
                 IndexEntryLocal *e = &entries[offsets[i]];
diff --git a/src/prefiltering/Indexer.cpp b/src/prefiltering/Indexer.cpp
index 4823ee4..659c5c6 100644
--- a/src/prefiltering/Indexer.cpp
+++ b/src/prefiltering/Indexer.cpp
@@ -32,13 +32,13 @@ void Indexer::reset(){
     this->lastKmerIndex = this->maxKmerIndex;
 }
 
-void Indexer::printKmer(size_t kmerIdx, int kmerSize, char* int2aa){
+void Indexer::printKmer(size_t kmerIdx, int kmerSize, char* num2aa){
     index2int(workspace, kmerIdx, kmerSize);
     for (int j = 0; j < kmerSize; j++)
-        Debug(Debug::INFO) << int2aa[workspace[j]];
+        Debug(Debug::INFO) << num2aa[workspace[j]];
 }
 
-void Indexer::printKmer(const int* kmer, int kmerSize, char* int2aa){
+void Indexer::printKmer(const unsigned char* kmer, int kmerSize, char* num2aa){
     for (int j = 0; j < kmerSize; j++)
-        Debug(Debug::INFO) << int2aa[kmer[j]];
+        Debug(Debug::INFO) << num2aa[kmer[j]];
 }
\ No newline at end of file
diff --git a/src/prefiltering/Indexer.h b/src/prefiltering/Indexer.h
index 895ed8e..b78c5ca 100644
--- a/src/prefiltering/Indexer.h
+++ b/src/prefiltering/Indexer.h
@@ -17,7 +17,7 @@ class Indexer{
     ~Indexer();
     
     // get the index of the k-mer, beginning at "begin" in the int_seq and ending at "end"
-    size_t int2index( const int *int_seq,const int begin,const int end){
+    size_t int2index( const unsigned char *int_seq,const int begin,const int end){
         this->lastKmerIndex = 0;
         size_t res1, res2, res3, res4;
 
@@ -83,7 +83,7 @@ class Indexer{
     }
 
     // get the index of the k-mer of length maxKmerSize, beginning at position 0
-    size_t int2index( const int *int_seq){
+    size_t int2index( const unsigned char *int_seq){
         int2index(int_seq, 0, this->maxKmerSize);
         return this->lastKmerIndex;
     }
@@ -97,7 +97,7 @@ class Indexer{
     }
     
     // k-mer iterator, remembers the last k-mer
-    size_t getNextKmerIndex (const int* kmer, int kmerSize){
+    size_t getNextKmerIndex (const unsigned char* kmer, int kmerSize){
         if (this->lastKmerIndex == this->maxKmerIndex)
             return int2index(kmer, 0, kmerSize);
         else{
@@ -112,16 +112,16 @@ class Indexer{
     
     // print k amino acids of the k-mer with index kmerIdx
     // int k-mer is written into workspace
-    void printKmer(size_t kmerIdx, int kmerSize, char* int2aa);
+    void printKmer(size_t kmerIdx, int kmerSize, char* num2aa);
     
     // print k amino acids of int k-mer kmer
-    void printKmer(const int* kmer, int kmerSize, char* int2aa);
+    void printKmer(const unsigned char* kmer, int kmerSize, char* num2aa);
     
     size_t * powers;
     size_t * workspace;
 
 
-    static size_t computeKmerIdx(const int *kmer, size_t kmerSize) {
+    static size_t computeKmerIdx(const unsigned char *kmer, size_t kmerSize) {
         uint64_t kmerIdx = 0;
         for(size_t kmerPos = 0; kmerPos < kmerSize; kmerPos++){
             kmerIdx = kmerIdx << 2;
diff --git a/src/prefiltering/KmerGenerator.cpp b/src/prefiltering/KmerGenerator.cpp
index c34c93f..333ec99 100644
--- a/src/prefiltering/KmerGenerator.cpp
+++ b/src/prefiltering/KmerGenerator.cpp
@@ -107,7 +107,7 @@ void KmerGenerator::initDataStructure(){
 }
 
 
-std::pair<size_t *, size_t> KmerGenerator::generateKmerList(const int * int_seq, bool addIdentity){
+std::pair<size_t *, size_t> KmerGenerator::generateKmerList(const unsigned char * int_seq, bool addIdentity){
     int dividerBefore=0;
     // pre compute phase
     // find first threshold
diff --git a/src/prefiltering/KmerGenerator.h b/src/prefiltering/KmerGenerator.h
index 1551fac..fdddb68 100644
--- a/src/prefiltering/KmerGenerator.h
+++ b/src/prefiltering/KmerGenerator.h
@@ -13,7 +13,7 @@ class KmerGenerator
         KmerGenerator(size_t kmerSize,size_t alphabetSize, short threshold);
         ~KmerGenerator();
         /*calculates the kmer list */
-        std::pair<size_t *, size_t> generateKmerList(const int * intSeq, bool addIdentity = false);
+        std::pair<size_t *, size_t> generateKmerList(const unsigned char * intSeq, bool addIdentity = false);
 
         /* kmer splitting stragety (3,2)
          fill up the divide step and calls init_result_list */
diff --git a/src/prefiltering/Main.cpp b/src/prefiltering/Main.cpp
index 896e931..3655c7b 100644
--- a/src/prefiltering/Main.cpp
+++ b/src/prefiltering/Main.cpp
@@ -1,4 +1,3 @@
-
 #include "Prefiltering.h"
 #include "Util.h"
 #include "Parameters.h"
@@ -7,9 +6,6 @@
 #include "Timer.h"
 #include "FileUtil.h"
 
-#include <iostream>
-#include <string>
-
 #ifdef OPENMP
 #include <omp.h>
 #endif
diff --git a/src/prefiltering/Prefiltering.cpp b/src/prefiltering/Prefiltering.cpp
index 6f67162..1fee5e8 100644
--- a/src/prefiltering/Prefiltering.cpp
+++ b/src/prefiltering/Prefiltering.cpp
@@ -3,6 +3,8 @@
 #include "ReducedMatrix.h"
 #include "ExtendedSubstitutionMatrix.h"
 #include "SubstitutionMatrixProfileStates.h"
+#include "DBWriter.h"
+
 #include "PatternCompiler.h"
 #include "FileUtil.h"
 #include "IndexBuilder.h"
@@ -10,10 +12,7 @@
 #include "ByteParser.h"
 #include "Parameters.h"
 #include "MemoryMapped.h"
-
-namespace prefilter {
-#include "ExpOpt3_8_polished.cs32.lib.h"
-}
+#include <sys/mman.h>
 
 #ifdef OPENMP
 #include <omp.h>
@@ -34,7 +33,6 @@ Prefiltering::Prefiltering(const std::string &queryDB,
         spacedKmerPattern(par.spacedKmerPattern),
         localTmp(par.localTmp),
         spacedKmer(par.spacedKmer != 0),
-        alphabetSize(par.alphabetSize),
         maskMode(par.maskMode),
         maskLowerCaseMode(par.maskLowerCaseMode),
         splitMode(par.splitMode),
@@ -57,23 +55,24 @@ Prefiltering::Prefiltering(const std::string &queryDB,
     // init the substitution matrices
     switch (querySeqType & 0x7FFFFFFF) {
         case Parameters::DBTYPE_NUCLEOTIDES:
-            kmerSubMat = getSubstitutionMatrix(scoringMatrixFile, alphabetSize, 1.0, false, true);
+            kmerSubMat = getSubstitutionMatrix(scoringMatrixFile, par.alphabetSize, 1.0, false, true);
             ungappedSubMat = kmerSubMat;
             alphabetSize = kmerSubMat->alphabetSize;
             break;
         case Parameters::DBTYPE_AMINO_ACIDS:
-            kmerSubMat = getSubstitutionMatrix(seedScoringMatrixFile, alphabetSize, 8.0, false, false);
-            ungappedSubMat = getSubstitutionMatrix(scoringMatrixFile, alphabetSize, 2.0, false, false);
+            kmerSubMat = getSubstitutionMatrix(seedScoringMatrixFile, par.alphabetSize, 8.0, false, false);
+            ungappedSubMat = getSubstitutionMatrix(scoringMatrixFile, par.alphabetSize, 2.0, false, false);
             alphabetSize = kmerSubMat->alphabetSize;
             break;
         case Parameters::DBTYPE_HMM_PROFILE:
             // needed for Background distributions
-            kmerSubMat = getSubstitutionMatrix(scoringMatrixFile, alphabetSize, 8.0, false, false);
-            ungappedSubMat = getSubstitutionMatrix(scoringMatrixFile, alphabetSize, 2.0, false, false);
+            kmerSubMat = getSubstitutionMatrix(scoringMatrixFile, par.alphabetSize, 8.0, false, false);
+            ungappedSubMat = getSubstitutionMatrix(scoringMatrixFile, par.alphabetSize, 2.0, false, false);
+            alphabetSize = kmerSubMat->alphabetSize;
             break;
         case Parameters::DBTYPE_PROFILE_STATE_PROFILE:
-            kmerSubMat = getSubstitutionMatrix(scoringMatrixFile, alphabetSize, 8.0, true, false);
-            ungappedSubMat = getSubstitutionMatrix(scoringMatrixFile, alphabetSize, 2.0, false, false);
+            kmerSubMat = getSubstitutionMatrix(scoringMatrixFile, par.alphabetSize, 8.0, true, false);
+            ungappedSubMat = getSubstitutionMatrix(scoringMatrixFile, par.alphabetSize, 2.0, false, false);
             alphabetSize = kmerSubMat->alphabetSize;
             break;
         default:
@@ -154,7 +153,7 @@ Prefiltering::Prefiltering(const std::string &queryDB,
             }
             spacedKmer = data.spacedKmer != 0;
             spacedKmerPattern = PrefilteringIndexReader::getSpacedPattern(tidxdbr);
-            seedScoringMatrixFile = ScoreMatrixFile(PrefilteringIndexReader::getSubstitutionMatrix(tidxdbr));
+            seedScoringMatrixFile = MultiParam<char*>(PrefilteringIndexReader::getSubstitutionMatrix(tidxdbr));
         } else {
             Debug(Debug::ERROR) << "Outdated index version. Please recompute it with 'createindex'!\n";
             EXIT(EXIT_FAILURE);
@@ -260,27 +259,6 @@ Prefiltering::~Prefiltering() {
     delete kmerSubMat;
 }
 
-void Prefiltering::reopenTargetDb() {
-    if (templateDBIsIndex == true) {
-        tidxdbr->close();
-        delete tidxdbr;
-        tidxdbr = NULL;
-    }
-
-    tdbr->close();
-    delete tdbr;
-
-    Debug(Debug::INFO) << "Index table not compatible with chosen settings. Compute index.\n";
-    tdbr = new DBReader<unsigned int>(targetDB.c_str(), targetDBIndex.c_str(), threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-    tdbr->open(DBReader<unsigned int>::NOSORT);
-    if (preloadMode != Parameters::PRELOAD_MODE_MMAP) {
-        tdbr->readMmapedDataInMemory();
-        tdbr->mlock();
-    }
-
-    templateDBIsIndex = false;
-}
-
 void Prefiltering::setupSplit(DBReader<unsigned int>& tdbr, const int alphabetSize, const unsigned int querySeqTyp, const int threads,
                               const bool templateDBIsIndex, const size_t memoryLimit, const size_t qDbSize,
                               size_t &maxResListLen, int &kmerSize, int &split, int &splitMode) {
@@ -424,70 +402,54 @@ void Prefiltering::mergeTargetSplits(const std::string &outDB, const std::string
     }
     reader1.setDataSize(totalSize);
 
-    size_t *starts = new size_t[threads];
-    size_t *lengths = new size_t[threads];
-    size_t **offsetStart = new size_t*[threads];
-    for (size_t i = 0; i < threads; ++i) {
-        reader1.decomposeDomainByAminoAcid(i, threads, &starts[i], &lengths[i]);
-        offsetStart[i] = new size_t[splits];
-    }
-
-    for (size_t s = 0; s < splits; ++s) {
-        DBReader<unsigned int> reader2(fileNames[s].first.c_str(), fileNames[s].second.c_str(), 1, DBReader<unsigned int>::USE_INDEX);
-        reader2.open(DBReader<unsigned int>::NOSORT);
-        DBReader<unsigned int>::Index *index2 = reader2.getIndex();
-        for (size_t t = 0; t < threads; t++) {
-            offsetStart[t][s] = index2[starts[t]].offset;
+    FILE ** files = new FILE*[fileNames.size()];
+    char ** dataFile = new char*[fileNames.size()];
+    size_t * dataFileSize = new size_t[fileNames.size()];
+    size_t globalIdOffset = 0;
+    for (size_t i = 0; i < splits; ++i) {
+        files[i] = FileUtil::openFileOrDie(fileNames[i].first.c_str(), "r", true);
+        dataFile[i] = static_cast<char*>(FileUtil::mmapFile(files[i], &dataFileSize[i]));
+#ifdef HAVE_POSIX_MADVISE
+        if (posix_madvise (dataFile[i], dataFileSize[i], POSIX_MADV_SEQUENTIAL) != 0){
+            Debug(Debug::ERROR) << "posix_madvise returned an error " << fileNames[i].first << "\n";
         }
-        reader2.close();
-    }
+#endif
 
+    }
     Debug(Debug::INFO) << "Preparing offsets for merging: " << timer.lap() << "\n";
     // merge target splits data files and sort the hits at the same time
     // TODO: compressed?
     DBWriter writer(outDB.c_str(), outDBIndex.c_str(), threads, 0, Parameters::DBTYPE_PREFILTER_RES);
     writer.open();
 
-    Debug::Progress pregress(reader1.getSize());
+    Debug::Progress progress(reader1.getSize());
 #pragma omp parallel num_threads(threads)
     {
         unsigned int thread_idx = 0;
 #ifdef OPENMP
         thread_idx = static_cast<unsigned int>(omp_get_thread_num());
 #endif
-
         std::string result;
         result.reserve(1024);
-
         std::vector<hit_t> hits;
         hits.reserve(300);
-
         char buffer[1024];
-
-        size_t id = starts[thread_idx];
-        size_t lastId = id + lengths[thread_idx];
-        FILE** files = new FILE*[splits];
-        for (size_t i = 0; i < splits; ++i) {
-            files[i] = fopen(fileNames[i].first.c_str(), "rb");
-            fseek(files[i], offsetStart[thread_idx][i], SEEK_SET);
-        }
-
-        while (id < lastId) {
-            pregress.updateProgress();
-            for (size_t i = 0; i < splits; ++i) {
-                int c1 = EOF;
-                size_t pos = 0;
-                while ((c1 = getc_unlocked(files[i])) != EOF) {
-                    buffer[pos++] = (char)c1;
-                    if (c1 == '\n') {
-                        hits.emplace_back(QueryMatcher::parsePrefilterHit(buffer));
-                        pos = 0;
-                    } else if (c1 == '\0') {
-                        break;
-                    }
+        size_t * currentDataFileOffset = new size_t[splits];
+        memset(currentDataFileOffset, 0, sizeof(size_t)*splits);
+        size_t currentId = __sync_fetch_and_add(&(globalIdOffset), 1);
+        size_t prevId = 0;
+        while(currentId < reader1.getSize()){
+            progress.updateProgress();
+            for(size_t file = 0; file < splits; file++){
+                size_t tmpId = prevId;
+                size_t pos;
+                for(pos = currentDataFileOffset[file]; pos < dataFileSize[file] && tmpId != currentId; pos++){
+                    tmpId += (dataFile[file][pos] == '\0');
+                    currentDataFileOffset[file] = pos;
                 }
+                currentDataFileOffset[file] = pos;
+                QueryMatcher::parsePrefilterHits(&dataFile[file][pos], hits);
             }
-
             if (hits.size() > 1) {
                 std::sort(hits.begin(), hits.end(), hit_t::compareHitsByScoreAndId);
             }
@@ -495,27 +457,26 @@ void Prefiltering::mergeTargetSplits(const std::string &outDB, const std::string
                 int len = QueryMatcher::prefilterHitToBuffer(buffer, hits[i]);
                 result.append(buffer, len);
             }
-            writer.writeData(result.c_str(), result.size(), reader1.getDbKey(id), thread_idx);
+            writer.writeData(result.c_str(), result.size(), reader1.getDbKey(currentId), thread_idx);
             hits.clear();
             result.clear();
-            id++;
+            prevId = currentId;
+            currentId = __sync_fetch_and_add(&(globalIdOffset), 1);
         }
 
-        for (size_t i = 0; i < splits; ++i) {
-            fclose(files[i]);
-        }
-        delete[] files;
-        delete[] offsetStart[thread_idx];
+            delete[] currentDataFileOffset;
     }
     writer.close();
     reader1.close();
 
     for (size_t i = 0; i < splits; ++i) {
         DBReader<unsigned int>::removeDb(fileNames[i].first);
+        FileUtil::munmapData(dataFile[i], dataFileSize[i]);
+        fclose(files[i]);
     }
-    delete[] offsetStart;
-    delete[] lengths;
-    delete[] starts;
+    delete [] dataFile;
+    delete [] dataFileSize;
+    delete [] files;
 
     Debug(Debug::INFO) << "Time for merging target splits: " << timer.lap() << "\n";
 }
@@ -570,7 +531,7 @@ void Prefiltering::getIndexTable(int split, size_t dbFrom, size_t dbSize) {
             sequenceLookup = NULL;
         }
 
-        indexTable->printStatistics(kmerSubMat->int2aa);
+        indexTable->printStatistics(kmerSubMat->num2aa);
         tdbr->remapData();
         Debug(Debug::INFO) << "Time for index table init: " << timer.lap() << "\n";
     }
@@ -783,6 +744,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
     size_t resSize = 0;
     size_t realResSize = 0;
     size_t diagonalOverflow = 0;
+    size_t trancatedCounter = 0;
     size_t totalQueryDBSize = querySize;
 
     unsigned int localThreads = 1;
@@ -813,10 +775,9 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
 #ifdef OPENMP
         thread_idx = static_cast<unsigned int>(omp_get_thread_num());
 #endif
-        Sequence seq(maxSeqLen, querySeqType, kmerSubMat, kmerSize, spacedKmer, aaBiasCorrection, true, spacedKmerPattern);
-
+        Sequence seq(qdbr->getMaxSeqLen(), querySeqType, kmerSubMat, kmerSize, spacedKmer, aaBiasCorrection, true, spacedKmerPattern);
         QueryMatcher matcher(indexTable, sequenceLookup, kmerSubMat,  ungappedSubMat,
-                             kmerThr, kmerSize, dbSize, maxSeqLen, maxResListLen, aaBiasCorrection,
+                             kmerThr, kmerSize, dbSize, std::max(tdbr->getMaxSeqLen(),qdbr->getMaxSeqLen()), maxResListLen, aaBiasCorrection,
                              diagonalScoring, minDiagScoreThr, takeOnlyBestKmer);
 
         if (seq.profile_matrix != NULL) {
@@ -831,7 +792,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
         std::string result;
         result.reserve(1000000);
 
-#pragma omp for schedule(dynamic, 2) reduction (+: kmersPerPos, resSize, dbMatches, doubleMatches, querySeqLenSum, diagonalOverflow)
+#pragma omp for schedule(dynamic, 2) reduction (+: kmersPerPos, resSize, dbMatches, doubleMatches, querySeqLenSum, diagonalOverflow, trancatedCounter)
         for (size_t id = queryFrom; id < queryFrom + querySize; id++) {
             progress.updateProgress();
             // get query sequence
@@ -894,6 +855,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
                 doubleMatches += matcher.getStatistics()->doubleMatches;
                 querySeqLenSum += seq.L;
                 diagonalOverflow += matcher.getStatistics()->diagonalOverflow;
+                trancatedCounter += matcher.getStatistics()->truncated;
                 resSize += resultSize;
                 realResSize += std::min(resultSize, maxResListLen);
                 reslens[thread_idx]->emplace_back(resultSize);
@@ -906,7 +868,7 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
                            dbMatches / totalQueryDBSize,
                            doubleMatches / totalQueryDBSize,
                            querySeqLenSum, diagonalOverflow,
-                           resSize / totalQueryDBSize);
+                           resSize / totalQueryDBSize, trancatedCounter);
 
         size_t empty = 0;
         for (size_t id = 0; id < querySize; id++) {
@@ -933,11 +895,15 @@ bool Prefiltering::runSplit(const std::string &resultDB, const std::string &resu
     // needed to speed up merge later on
     // sorts this datafile according to the index file
     if (splitMode == Parameters::TARGET_DB_SPLIT && splits > 1) {
-        // delete indexTable to free memory:
+        // free memory early since the merge might need quite a bit of memory
         if (indexTable != NULL) {
             delete indexTable;
             indexTable = NULL;
         }
+        if (sequenceLookup != NULL) {
+            delete sequenceLookup;
+            sequenceLookup = NULL;
+        }
         DBReader<unsigned int> resultReader(tmpDbw.getDataFileName(), tmpDbw.getIndexFileName(), threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
         resultReader.open(DBReader<unsigned int>::NOSORT);
         resultReader.readMmapedDataInMemory();
@@ -972,6 +938,7 @@ void Prefiltering::printStatistics(const statistics_t &stats, std::list<int> **r
     Debug(Debug::INFO) << "\n" << stats.kmersPerPos << " k-mers per position\n";
     Debug(Debug::INFO) << stats.dbMatches << " DB matches per sequence\n";
     Debug(Debug::INFO) << stats.diagonalOverflow << " overflows\n";
+    Debug(Debug::INFO) << stats.truncated << " queries produce too much hits (truncated result)\n";
     Debug(Debug::INFO) << stats.resultsPassedPrefPerSeq << " sequences passed prefiltering per query sequence";
     if (stats.resultsPassedPrefPerSeq > maxResults)
         Debug(Debug::WARNING) << " (ATTENTION: max. " << maxResults
@@ -986,14 +953,14 @@ void Prefiltering::printStatistics(const statistics_t &stats, std::list<int> **r
 }
 
 
-BaseMatrix *Prefiltering::getSubstitutionMatrix(const ScoreMatrixFile &scoringMatrixFile, size_t alphabetSize, float bitFactor, bool profileState, bool isNucl) {
+BaseMatrix *Prefiltering::getSubstitutionMatrix(const MultiParam<char*> &scoringMatrixFile, MultiParam<int> alphabetSize, float bitFactor, bool profileState, bool isNucl) {
     BaseMatrix *subMat;
 
     if (isNucl){
         subMat = new NucleotideMatrix(scoringMatrixFile.nucleotides, bitFactor, 0.0);
-    } else if (alphabetSize < 21) {
+    } else if (alphabetSize.aminoacids < 21) {
         SubstitutionMatrix sMat(scoringMatrixFile.aminoacids, bitFactor, -0.2f);
-        subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2int, sMat.int2aa, sMat.alphabetSize, alphabetSize, bitFactor);
+        subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2num, sMat.num2aa, sMat.alphabetSize, alphabetSize.aminoacids, bitFactor);
     }else if(profileState == true){
         SubstitutionMatrix sMat(scoringMatrixFile.aminoacids, bitFactor, -0.2f);
         subMat = new SubstitutionMatrixProfileStates(sMat.matrixName, sMat.probMatrix, sMat.pBack,
@@ -1098,7 +1065,7 @@ std::pair<int, int> Prefiltering::optimizeSplit(size_t totalMemoryInByte, DBRead
         endKmerSize   = (externalKmerSize == 0) ? 15 : externalKmerSize;
     }
 
-    for (int optKmerSize = startKmerSize; optKmerSize <= endKmerSize ; optKmerSize++) {
+    for (int optKmerSize = endKmerSize; optKmerSize >= startKmerSize ; optKmerSize--) {
         size_t aaUpperBoundForKmerSize = (SIZE_MAX - 1);
         if(externalKmerSize == 0){
             if(Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
diff --git a/src/prefiltering/Prefiltering.h b/src/prefiltering/Prefiltering.h
index be599f8..d6b8729 100644
--- a/src/prefiltering/Prefiltering.h
+++ b/src/prefiltering/Prefiltering.h
@@ -3,7 +3,6 @@
 
 #include "Parameters.h"
 #include "DBReader.h"
-#include "DBWriter.h"
 #include "IndexTable.h"
 #include "BaseMatrix.h"
 #include "ScoreMatrix.h"
@@ -14,7 +13,6 @@
 #include <list>
 #include <utility>
 
-
 class Prefiltering {
 public:
     Prefiltering(
@@ -40,7 +38,7 @@ class Prefiltering {
                     const std::vector<std::pair<std::string, std::string>> &splitFiles);
 
     // get substitution matrix
-    static BaseMatrix *getSubstitutionMatrix(const ScoreMatrixFile &scoringMatrixFile, size_t alphabetSize, float bitFactor, bool profileState, bool isNucl);
+    static BaseMatrix *getSubstitutionMatrix(const MultiParam<char*> &scoringMatrixFile, MultiParam<int> alphabetSize, float bitFactor, bool profileState, bool isNucl);
 
     static void setupSplit(DBReader<unsigned int>& dbr, const int alphabetSize, const unsigned int querySeqType, const int threads,
                            const bool templateDBIsIndex, const size_t memoryLimit, const size_t qDbSize,
@@ -80,8 +78,8 @@ class Prefiltering {
     int maskLowerCaseMode;
     int splitMode;
     int kmerThr;
-    ScoreMatrixFile scoringMatrixFile;
-    ScoreMatrixFile seedScoringMatrixFile;
+    MultiParam<char*> scoringMatrixFile;
+    MultiParam<char*>  seedScoringMatrixFile;
     int targetSeqType;
     bool takeOnlyBestKmer;
     size_t maxResListLen;
@@ -124,8 +122,6 @@ class Prefiltering {
                          unsigned int resLensSize, size_t empty, size_t maxResults);
 
     bool isSameQTDB();
-
-    void reopenTargetDb();
 };
 
 #endif
diff --git a/src/prefiltering/PrefilteringIndexReader.cpp b/src/prefiltering/PrefilteringIndexReader.cpp
index 6fa2965..0bdc038 100644
--- a/src/prefiltering/PrefilteringIndexReader.cpp
+++ b/src/prefiltering/PrefilteringIndexReader.cpp
@@ -202,7 +202,7 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
                                    (maskMode == 1 || maskLowerCase == 1) ? &sequenceLookup : NULL,
                                    (maskMode == 0 ) ? &sequenceLookup : NULL,
                                    *subMat, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr, maskMode, maskLowerCase);
-        indexTable.printStatistics(subMat->int2aa);
+        indexTable.printStatistics(subMat->num2aa);
 
         if (sequenceLookup == NULL) {
             Debug(Debug::ERROR) << "Invalid mask mode. No sequence lookup created!\n";
@@ -547,4 +547,19 @@ std::string PrefilteringIndexReader::searchForIndex(const std::string &pathToDB)
     return "";
 }
 
+std::string PrefilteringIndexReader::dbPathWithoutIndex(std::string & dbname) {
+    // Return dbname with a trailing ".idx" or ".linidx" removed; any other name is returned unchanged.
+    // The two suffixes are mutually exclusive: a name ending in ".linidx" does not end in ".idx".
+    const std::string indexSuffixes[] = { ".idx", ".linidx" };
+    for (const std::string &suffix : indexSuffixes) {
+        // the suffix only counts if it terminates the name
+        const bool isTrailing = dbname.size() >= suffix.size()
+            && dbname.compare(dbname.size() - suffix.size(), suffix.size(), suffix) == 0;
+        if (isTrailing) {
+            return dbname.substr(0, dbname.size() - suffix.size());
+        }
+    }
+    return dbname;
+}
+
 
diff --git a/src/prefiltering/PrefilteringIndexReader.h b/src/prefiltering/PrefilteringIndexReader.h
index c9983f1..31c92ea 100644
--- a/src/prefiltering/PrefilteringIndexReader.h
+++ b/src/prefiltering/PrefilteringIndexReader.h
@@ -83,6 +83,8 @@ class PrefilteringIndexReader {
 
     static std::string searchForIndex(const std::string &pathToDB);
 
+    static std::string dbPathWithoutIndex(std::string &dbname);
+
 private:
     static void printMeta(int *meta);
 };
diff --git a/src/prefiltering/QueryMatcher.cpp b/src/prefiltering/QueryMatcher.cpp
index e20d567..b47c68e 100644
--- a/src/prefiltering/QueryMatcher.cpp
+++ b/src/prefiltering/QueryMatcher.cpp
@@ -1,9 +1,3 @@
-//
-// Created by mad on 5/26/15.
-//
-#include <new>
-#include <iomanip>
-
 #include "SubstitutionMatrix.h"
 #include "QueryMatcher.h"
 #include "Util.h"
@@ -43,14 +37,14 @@ QueryMatcher::QueryMatcher(IndexTable *indexTable, SequenceLookup *sequenceLooku
     // assure that the whole database can be matched (extreme case)
     // this array will need 500 MB for 50 Mio. sequences ( dbSize * 2 * 5byte)
     this->dbSize = dbSize;
-    this->counterResultSize = std::max((size_t)1000000, dbSize);
+    this->foundDiagonalsSize = std::max((size_t)1000000, dbSize);
     this->maxDbMatches = std::max((size_t)1000000, dbSize) * 2;
     // we can never find more hits than dbSize
     this->maxHitsPerQuery = std::min(maxHitsPerQuery, dbSize);
     this->resList = (hit_t *) mem_align(ALIGN_INT, maxHitsPerQuery * sizeof(hit_t) );
     this->databaseHits = new(std::nothrow) IndexEntryLocal[maxDbMatches];
     Util::checkAllocation(databaseHits, "Can not allocate databaseHits memory in QueryMatcher");
-    this->foundDiagonals = (CounterResult*)calloc(counterResultSize, sizeof(CounterResult));
+    this->foundDiagonals = (CounterResult*)calloc(foundDiagonalsSize, sizeof(CounterResult));
     Util::checkAllocation(foundDiagonals, "Can not allocate foundDiagonals memory in QueryMatcher");
     this->lastSequenceHit = this->databaseHits + maxDbMatches;
     this->indexPointer = new(std::nothrow) IndexEntryLocal*[maxSeqLen + 1];
@@ -74,11 +68,11 @@ QueryMatcher::QueryMatcher(IndexTable *indexTable, SequenceLookup *sequenceLooku
 QueryMatcher::~QueryMatcher(){
     deleteDiagonalMatcher(activeCounter);
     free(resList);
-    delete [] scoreSizes;
-    delete [] databaseHits;
-    delete [] indexPointer;
+    delete[] scoreSizes;
+    delete[] databaseHits;
+    delete[] indexPointer;
     free(foundDiagonals);
-    delete [] compositionBias;
+    delete[] compositionBias;
     if(ungappedAlignment != NULL){
         delete ungappedAlignment;
     }
@@ -86,22 +80,7 @@ QueryMatcher::~QueryMatcher(){
     delete kmerGenerator;
 }
 
-size_t QueryMatcher::evaluateBins(IndexEntryLocal **hitsByIndex,
-                                  CounterResult *output,
-                                  size_t outputSize,
-                                  unsigned short indexFrom,
-                                  unsigned short indexTo,
-                                  bool computeTotalScore) {
-    size_t localResultSize = 0;
-#define COUNT_CASE(x) case x: localResultSize += cachedOperation##x->countElements(hitsByIndex, output, outputSize, indexFrom, indexTo, computeTotalScore); break;
-    switch (activeCounter){
-        FOR_EACH(COUNT_CASE,2,4,8,16,32,64,128,256,512,1024,2048)
-    }
-#undef COUNT_CASE
-    return localResultSize;
-}
-
-std::pair<hit_t *, size_t> QueryMatcher::matchQuery (Sequence * querySeq, unsigned int identityId){
+std::pair<hit_t*, size_t> QueryMatcher::matchQuery(Sequence *querySeq, unsigned int identityId) {
     querySeq->resetCurrPos();
 //    std::cout << "Id: " << querySeq->getId() << std::endl;
     memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int));
@@ -109,7 +88,7 @@ std::pair<hit_t *, size_t> QueryMatcher::matchQuery (Sequence * querySeq, unsign
     // bias correction
     if(aaBiasCorrection == true){
         if(Parameters::isEqualDbtype(querySeq->getSeqType(), Parameters::DBTYPE_AMINO_ACIDS)) {
-            SubstitutionMatrix::calcLocalAaBiasCorrection(kmerSubMat, querySeq->int_sequence, querySeq->L, compositionBias);
+            SubstitutionMatrix::calcLocalAaBiasCorrection(kmerSubMat, querySeq->numSequence, querySeq->L, compositionBias);
         }else{
             memset(compositionBias, 0, sizeof(float) * querySeq->L);
         }
@@ -118,22 +97,20 @@ std::pair<hit_t *, size_t> QueryMatcher::matchQuery (Sequence * querySeq, unsign
     }
 
     size_t resultSize = match(querySeq, compositionBias);
-    std::pair<hit_t *, size_t > queryResult;
+    std::pair<hit_t *, size_t> queryResult;
     if (diagonalScoring) {
         // write diagonal scores in count value
         ungappedAlignment->processQuery(querySeq, compositionBias, foundDiagonals, resultSize);
         memset(scoreSizes, 0, SCORE_RANGE * sizeof(unsigned int));
 
-
         resultSize = keepMaxScoreElementOnly(foundDiagonals, resultSize);
 
         updateScoreBins(foundDiagonals, resultSize);
         unsigned int diagonalThr = computeScoreThreshold(scoreSizes, this->maxHitsPerQuery);
         diagonalThr = std::max(minDiagScoreThr, diagonalThr);
 
-
         // sort to not lose highest scoring hits if > 150.000 hits are searched
-        if(resultSize < counterResultSize/2){
+        if(resultSize < foundDiagonalsSize / 2){
             unsigned int maxDiagonalScoreThr = (UCHAR_MAX - ungappedAlignment->getQueryBias());
             bool scoreIsTruncated = (diagonalThr >= maxDiagonalScoreThr) ? true : false;
             size_t elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals + resultSize, diagonalThr, foundDiagonals, resultSize);
@@ -147,19 +124,23 @@ std::pair<hit_t *, size_t> QueryMatcher::matchQuery (Sequence * querySeq, unsign
             }else{
                 queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(foundDiagonals + resultSize, elementsCntAboveDiagonalThr, identityId, diagonalThr, ungappedAlignment, false);
             }
+            stats->truncated = 0;
         }else{
-            Debug(Debug::WARNING) << "Sequence " << querySeq->getDbKey() << " produces too many hits. Results might be truncated\n";
+            //Debug(Debug::WARNING) << "Sequence " << querySeq->getDbKey() << " produces too many hits. Results might be truncated\n";
             queryResult = getResult<UNGAPPED_DIAGONAL_SCORE>(foundDiagonals, resultSize, identityId, diagonalThr, ungappedAlignment, false);
+            stats->truncated = 1;
         }
     }else{
         unsigned int thr = computeScoreThreshold(scoreSizes, this->maxHitsPerQuery);
         thr = std::max(minDiagScoreThr, thr);
-        if(resultSize < counterResultSize/2) {
+        if(resultSize < foundDiagonalsSize / 2) {
             int elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals + resultSize, thr, foundDiagonals, resultSize);
             queryResult = getResult<KMER_SCORE>(foundDiagonals + resultSize, elementsCntAboveDiagonalThr, identityId, thr, ungappedAlignment, false);
+            stats->truncated = 0;
         }else{
-            Debug(Debug::WARNING) << "Sequence " << querySeq->getDbKey() << " produces too many hits. Results might be truncated\n";
+//            Debug(Debug::WARNING) << "Sequence " << querySeq->getDbKey() << " produces too many hits. Results might be truncated\n";
             queryResult = getResult<KMER_SCORE>(foundDiagonals, resultSize, identityId, thr, ungappedAlignment, false);
+            stats->truncated = 1;
         }
     }
     if(queryResult.second > 1){
@@ -184,20 +165,16 @@ size_t QueryMatcher::match(Sequence *seq, float *compositionBias) {
     size_t seqListSize;
     unsigned short indexStart = 0;
     unsigned short indexTo = 0;
-    const int xIndex = kmerSubMat->aa2int[(int)'X'];
-
-    while(seq->hasNextKmer()){
-        const int * kmer = seq->nextKmer();
-        const unsigned char * pos = seq->getAAPosInSpacedPattern();
+    while (seq->hasNextKmer()) {
+        const unsigned char *kmer = seq->nextKmer();
+        const unsigned char *pos = seq->getAAPosInSpacedPattern();
         const unsigned short current_i = seq->getCurrentPosition();
 
         float biasCorrection = 0;
-        int xCount = 0;
         for (int i = 0; i < kmerSize; i++){
-            xCount += (kmer[i] == xIndex);
             biasCorrection += compositionBias[current_i + static_cast<short>(pos[i])];
         }
-        if(xCount > 0){
+        if (seq->kmerContainsX()) {
             indexTo = current_i;
             indexPointer[current_i] = sequenceHits;
             continue;
@@ -209,14 +186,14 @@ size_t QueryMatcher::match(Sequence *seq, float *compositionBias) {
         // adjust kmer threshold based on composition bias
         kmerGenerator->setThreshold(kmerMatchScore);
 
-        const size_t * index;
+        const size_t *index;
         size_t exactKmer;
         size_t kmerElementSize;
-        if(takeOnlyBestKmer){
+        if (takeOnlyBestKmer) {
             kmerElementSize = 1;
             exactKmer = idx.int2index(kmer);
             index = &exactKmer;
-        }else{
+        } else {
             std::pair<size_t*, size_t> kmerList = kmerGenerator->generateKmerList(kmer);
             kmerElementSize = kmerList.second;
             index = kmerList.first;
@@ -225,54 +202,50 @@ size_t QueryMatcher::match(Sequence *seq, float *compositionBias) {
         indexPointer[current_i] = sequenceHits;
         // match the index table
 
-        //idx.printKmer(kmerList.index[0], kmerSize, m->int2aa);
-        //std::cout  << "\t" << kmerMatchScore << std::endl;
+        //idx.printKmer(kmerList.index[0], kmerSize, m->num2aa);
+        //std::cout << "\t" << kmerMatchScore << std::endl;
         kmerListLen += kmerElementSize;
 
         for (unsigned int kmerPos = 0; kmerPos < kmerElementSize; kmerPos++) {
-            // generate k-mer list
-//                        idx.printKmer(index[kmerPos], kmerSize, m->int2aa);
-//                        std::cout << std::endl;
-
             const IndexEntryLocal *entries = indexTable->getDBSeqList(index[kmerPos], &seqListSize);
+            // DEBUG
+            //std::cout << seq->getDbKey() << std::endl;
+            //idx.printKmer(index[kmerPos], kmerSize, kmerSubMat->num2aa);
+            //std::cout << "\t" << current_i << "\t"<< index[kmerPos] << std::endl;
+            //for (size_t i = 0; i < seqListSize; i++) {
+            //    char diag = entries[i].position_j - current_i;
+            //    std::cout << "(" << entries[i].seqId << " " << (int) diag << ")\t";
+            //}
+            //std::cout << std::endl;
 
-            /////DEBUG
-           /* 
-            idx.printKmer(index[kmerPos], kmerSize, m->int2aa);
-            std::cout << "\t" << current_i << "\t"<< index[kmerPos] << std::endl;
-            for(size_t i = 0; i < seqListSize; i++){
-                char diag = entries[i].position_j - current_i;
-                std::cout << "(" << entries[i].seqId << " " << (int) diag << ")\t";
-            }
-            std::cout << std::endl;
-            */
-            /////DEBUG
             // detected overflow while matching
             if ((sequenceHits + seqListSize) >= lastSequenceHit) {
                 stats->diagonalOverflow = true;
                 // last pointer
                 indexPointer[current_i + 1] = sequenceHits;
-//                std::cout << "Overflow in i=" << indexStart << std::endl;
-                const size_t hitCount = evaluateBins(indexPointer,
-                                                     foundDiagonals + overflowHitCount,
-                                                     counterResultSize - overflowHitCount,
-                                                     indexStart, current_i, (diagonalScoring == false));
-                if(overflowHitCount != 0){ //merge lists
-                    // hitCount is max. dbSize so there can be no overflow in mergeElemens
-                    overflowHitCount = mergeElements(foundDiagonals, overflowHitCount +  hitCount);
+                //std::cout << "Overflow in i=" << indexStart << std::endl;
+                const size_t hitCount = findDuplicates(indexPointer,
+                                                       foundDiagonals + overflowHitCount,
+                                                       foundDiagonalsSize - overflowHitCount,
+                                                       indexStart, current_i, (diagonalScoring == false));
+
+                if (overflowHitCount != 0) {
+                    // merge lists, hitCount is max. dbSize so there can be no overflow in mergeElements
+                    overflowHitCount = mergeElements(foundDiagonals, hitCount + overflowHitCount);
                 } else {
                     overflowHitCount = hitCount;
                 }
                 // reset pointer position
                 sequenceHits = databaseHits;
-                indexPointer[current_i] = databaseHits;
+                indexPointer[current_i] = sequenceHits;
                 indexStart = current_i;
                 overflowNumMatches += numMatches;
                 numMatches = 0;
-                if((sequenceHits + seqListSize) >= lastSequenceHit){
+                // TODO might delete this?
+                if ((sequenceHits + seqListSize) >= lastSequenceHit){
                     goto outer;
                 }
-            };
+            }
             memcpy(sequenceHits, entries, sizeof(IndexEntryLocal) * seqListSize);
             sequenceHits += seqListSize;
             numMatches += seqListSize;
@@ -281,10 +254,11 @@ size_t QueryMatcher::match(Sequence *seq, float *compositionBias) {
     }
     outer:
     indexPointer[indexTo + 1] = databaseHits + numMatches;
-    size_t hitCount = evaluateBins(indexPointer, foundDiagonals + overflowHitCount,
-                                   counterResultSize - overflowHitCount, indexStart, indexTo, (diagonalScoring == false));
-    //fill the output
-    if(overflowHitCount != 0){ // overflow occurred
+    // fill the output
+    size_t hitCount = findDuplicates(indexPointer, foundDiagonals + overflowHitCount,
+                                     foundDiagonalsSize - overflowHitCount, indexStart, indexTo, (diagonalScoring == false));
+    if (overflowHitCount != 0) {
+        // overflow occurred
         hitCount = mergeElements(foundDiagonals, overflowHitCount + hitCount);
     }
     stats->doubleMatches = 0;
@@ -293,9 +267,10 @@ size_t QueryMatcher::match(Sequence *seq, float *compositionBias) {
         updateScoreBins(foundDiagonals, hitCount);
         stats->doubleMatches = getDoubleDiagonalMatches();
     }
-    stats->kmersPerPos   = ((double)kmerListLen/(double)seq->L);
-    stats->querySeqLen   = seq->L;
-    stats->dbMatches     = overflowNumMatches + numMatches;
+    stats->kmersPerPos = ((double)kmerListLen/(double)seq->L);
+    stats->querySeqLen = seq->L;
+    stats->dbMatches   = overflowNumMatches + numMatches;
+
     return hitCount;
 }
 
@@ -315,12 +290,12 @@ void QueryMatcher::updateScoreBins(CounterResult *result, size_t elementCount) {
 }
 
 template <int TYPE>
-std::pair<hit_t *, size_t>  QueryMatcher::getResult(CounterResult * results,
-                                                    size_t resultSize,
-                                                    const unsigned int id,
-                                                    const unsigned short thr,
-                                                    UngappedAlignment * align,
-                                                    const int rescaleScore) {
+std::pair<hit_t*, size_t> QueryMatcher::getResult(CounterResult * results,
+                                                  size_t resultSize,
+                                                  const unsigned int id,
+                                                  const unsigned short thr,
+                                                  UngappedAlignment *align,
+                                                  const int rescaleScore) {
     size_t currentHits = 0;
     if (id != UINT_MAX) {
         hit_t *result = (resList + 0);
@@ -379,8 +354,8 @@ std::pair<hit_t *, size_t>  QueryMatcher::getResult(CounterResult * results,
 
 void QueryMatcher::initDiagonalMatcher(size_t dbsize, unsigned int maxDbMatches) {
     uint64_t l2CacheSize = Util::getL2CacheSize();
-#define INIT(x)   cachedOperation##x = new CacheFriendlyOperations<x>(dbsize, maxDbMatches/x); \
-                  activeCounter = x;
+#define INIT(x) cachedOperation##x = new CacheFriendlyOperations<x>(dbsize, maxDbMatches/x); \
+                activeCounter = x;
     if(dbsize/2 < l2CacheSize){
         INIT(2)
     }else if(dbsize/4 < l2CacheSize){
@@ -415,6 +390,19 @@ void QueryMatcher::deleteDiagonalMatcher(unsigned int activeCounter){
 #undef DELETE_CASE
 }
 
+size_t QueryMatcher::findDuplicates(IndexEntryLocal **hitsByIndex,
+                                   CounterResult *output, size_t outputSize,
+                                   unsigned short indexFrom, unsigned short indexTo,
+                                   bool computeTotalScore) {
+    // Forward to the cachedOperation<N> member whose N equals activeCounter and return its result.
+#define FIND_DUPLICATES_CASE(x) case x: return cachedOperation##x->findDuplicates(hitsByIndex, output, outputSize, indexFrom, indexTo, computeTotalScore);
+    switch (activeCounter){
+        FOR_EACH(FIND_DUPLICATES_CASE,2,4,8,16,32,64,128,256,512,1024,2048)
+    }
+#undef FIND_DUPLICATES_CASE
+    return 0; // activeCounter matched none of the supported sizes, so no operation was invoked
+}
+
 size_t QueryMatcher::mergeElements(CounterResult *foundDiagonals, size_t hitCounter) {
     size_t overflowHitCount = 0;
 #define MERGE_CASE(x) \
@@ -470,12 +458,8 @@ size_t QueryMatcher::radixSortByScoreSize(const unsigned int * scoreSizes,
 std::pair<size_t, unsigned int> QueryMatcher::rescoreHits(Sequence * querySeq, unsigned int * scoreSizes, CounterResult *results,
         size_t resultSize, UngappedAlignment *align, int lowerBoundScore) {
     size_t elements = 0;
-    unsigned char * query = new unsigned char[querySeq->L];
-    for(int pos = 0; pos < querySeq->L; pos++ ){
-        query[pos] = static_cast<unsigned char>(querySeq->int_sequence[pos]);
-    }
-    int maxSelfScore = align->scoreSingleSequence(std::make_pair(static_cast<const unsigned char*>(query),querySeq->L), 0,0);
-    delete [] query;
+    const unsigned char * query = querySeq->numSequence;
+    int maxSelfScore = align->scoreSingleSequence(std::make_pair(query, querySeq->L), 0,0);
 
     maxSelfScore = std::min(maxSelfScore, USHRT_MAX);
     maxSelfScore = (maxSelfScore-lowerBoundScore);
@@ -511,4 +495,4 @@ template std::pair<hit_t *, size_t>  QueryMatcher::getResult<1>(CounterResult *
 #undef FE_4
 #undef FE_3
 #undef FE_2
-#undef FE_1
\ No newline at end of file
+#undef FE_1
diff --git a/src/prefiltering/QueryMatcher.h b/src/prefiltering/QueryMatcher.h
index 8e4af48..bb003e2 100644
--- a/src/prefiltering/QueryMatcher.h
+++ b/src/prefiltering/QueryMatcher.h
@@ -20,14 +20,16 @@ struct statistics_t{
     size_t querySeqLen;
     size_t diagonalOverflow;
     size_t resultsPassedPrefPerSeq;
-    statistics_t() : kmersPerPos(0.0) , dbMatches(0) , doubleMatches(0), querySeqLen(0), diagonalOverflow(0), resultsPassedPrefPerSeq(0) {};
+    size_t truncated;
+    statistics_t() : kmersPerPos(0.0) , dbMatches(0) , doubleMatches(0), querySeqLen(0), diagonalOverflow(0), resultsPassedPrefPerSeq(0), truncated(0) {};
     statistics_t(double kmersPerPos, size_t dbMatches,
-                 size_t doubleMatches, size_t querySeqLen, size_t diagonalOverflow, size_t resultsPassedPrefPerSeq) : kmersPerPos(kmersPerPos),
+                 size_t doubleMatches, size_t querySeqLen, size_t diagonalOverflow, size_t resultsPassedPrefPerSeq, size_t truncated) : kmersPerPos(kmersPerPos),
                                                                                                                       dbMatches(dbMatches),
                                                                                                                       doubleMatches(doubleMatches),
                                                                                                                       querySeqLen(querySeqLen),
                                                                                                                       diagonalOverflow(diagonalOverflow),
-                                                                                                                      resultsPassedPrefPerSeq(resultsPassedPrefPerSeq){};
+                                                                                                                      resultsPassedPrefPerSeq(resultsPassedPrefPerSeq),
+                                                                                                                      truncated(truncated){};
 };
 
 struct hit_t {
@@ -35,14 +37,14 @@ struct hit_t {
     int prefScore;
     unsigned short diagonal;
 
-    static bool compareHitsByScoreAndId(hit_t first, hit_t second){
-        if(first.prefScore > second.prefScore )
+    static bool compareHitsByScoreAndId(const hit_t &first, const hit_t &second){
+        if (abs(first.prefScore) > abs(second.prefScore))
             return true;
-        if(second.prefScore > first.prefScore )
+        if (abs(second.prefScore) > abs(first.prefScore))
             return false;
-        if(first.seqId < second.seqId )
+        if (first.seqId < second.seqId)
             return true;
-        if(second.seqId < first.seqId )
+        if (second.seqId < first.seqId)
             return false;
         return false;
     }
@@ -59,42 +61,23 @@ class QueryMatcher {
 
     // returns result for the sequence
     // identityId is the id of the identitical sequence in the target database if there is any, UINT_MAX otherwise
-    std::pair<hit_t *, size_t>  matchQuery(Sequence * querySeq, unsigned int identityId);
-
-    // find duplicates in the diagonal bins
-    size_t evaluateBins(IndexEntryLocal **hitsByIndex, CounterResult *output,
-                        size_t outputSize, unsigned short indexFrom, unsigned short indexTo, bool computeTotalScore);
-
-    void updateScoreBins(CounterResult *result, size_t elementCount);
+    std::pair<hit_t*, size_t> matchQuery(Sequence *querySeq, unsigned int identityId);
 
     // set substituion matrix for KmerGenerator
     void setProfileMatrix(ScoreMatrix **matrix){
-        this->kmerGenerator->setDivideStrategy(matrix );
+        kmerGenerator->setDivideStrategy(matrix);
     }
 
     // set substitution matrix
-    void setSubstitutionMatrix(ScoreMatrix * three, ScoreMatrix * two) {
-        this->kmerGenerator->setDivideStrategy(three, two );
+    void setSubstitutionMatrix(ScoreMatrix *three, ScoreMatrix *two) {
+        kmerGenerator->setDivideStrategy(three, two);
     }
 
     // get statistics
-    const statistics_t * getStatistics(){
+    const statistics_t *getStatistics() {
         return stats;
     }
 
-    const static size_t SCORE_RANGE = 256;
-
-    static unsigned int computeScoreThreshold(unsigned int * scoreSizes, size_t maxHitsPerQuery) {
-        size_t foundHits = 0;
-        size_t scoreThr = 0;
-        for(scoreThr = SCORE_RANGE - 1; scoreThr > 0 ; scoreThr--){
-            foundHits += scoreSizes[scoreThr];
-            if(foundHits >= maxHitsPerQuery)
-                break;
-        }
-        return scoreThr;
-    }
-
     static hit_t parsePrefilterHit(char* data) {
         hit_t result;
         const char *wordCnt[255];
@@ -120,8 +103,15 @@ class QueryMatcher {
         return ret;
     }
 
-    static size_t prefilterHitToBuffer(char *buff1, hit_t &h)
-    {
+    // Appends the hits parsed from data to entries, advancing with Util::skipLine until a '\0' byte is reached.
+    static void parsePrefilterHits(char *data, std::vector<hit_t> &entries) {
+        for (char *cursor = data; *cursor != '\0'; cursor = Util::skipLine(cursor)) {
+            const hit_t parsed = parsePrefilterHit(cursor);
+            entries.push_back(parsed);
+        }
+    }
+
+    static size_t prefilterHitToBuffer(char *buff1, hit_t &h) {
         char * basePos = buff1;
         char * tmpBuff = Itoa::u32toa_sse2((uint32_t) h.seqId, buff1);
         *(tmpBuff-1) = '\t';
@@ -140,15 +130,15 @@ class QueryMatcher {
     const static int UNGAPPED_DIAGONAL_SCORE = 1;
 
     // keeps stats for run
-    statistics_t * stats;
+    statistics_t *stats;
     // scoring matrix for local amino acid bias correction
-    BaseMatrix * kmerSubMat;
+    BaseMatrix *kmerSubMat;
     // scoring matrix for ungapped alignment
-    BaseMatrix * ungappedAlignmentSubMat;
+    BaseMatrix *ungappedAlignmentSubMat;
     /* generates kmer lists */
-    KmerGenerator * kmerGenerator;
+    KmerGenerator *kmerGenerator;
     /* contains the sequences for a kmer */
-    IndexTable * indexTable;
+    IndexTable *indexTable;
     // k of the k-mer
     int kmerSize;
     // local amino acid bias correction
@@ -164,19 +154,6 @@ class QueryMatcher {
     // result hit buffer
     //CacheFriendlyOperations * diagonalMatcher;
     unsigned int activeCounter;
-#define CacheFriendlyOperations(x)  CacheFriendlyOperations<x> * cachedOperation##x
-    CacheFriendlyOperations(2);
-    CacheFriendlyOperations(4);
-    CacheFriendlyOperations(8);
-    CacheFriendlyOperations(16);
-    CacheFriendlyOperations(32);
-    CacheFriendlyOperations(64);
-    CacheFriendlyOperations(128);
-    CacheFriendlyOperations(256);
-    CacheFriendlyOperations(512);
-    CacheFriendlyOperations(1024);
-    CacheFriendlyOperations(2048);
-#undef CacheFriendlyOperations
 
     // matcher for diagonal
     UngappedAlignment *ungappedAlignment;
@@ -191,19 +168,46 @@ class QueryMatcher {
     IndexEntryLocal **indexPointer;
 
     // keeps data in inner loop
-    IndexEntryLocal * __restrict databaseHits;
+    IndexEntryLocal *__restrict databaseHits;
 
     // evaluated bins
-    CounterResult * foundDiagonals;
+    CounterResult *foundDiagonals;
+
+    // size of max diagonalMatcher result objects
+    size_t foundDiagonalsSize;
 
     // last data pointer (for overflow check)
-    IndexEntryLocal * lastSequenceHit;
+    IndexEntryLocal *lastSequenceHit;
 
     // max seq. per query
     size_t maxHitsPerQuery;
 
+    float *compositionBias;
+
+    // diagonal scoring active
+    bool diagonalScoring;
+    unsigned int minDiagScoreThr;
+
+    Indexer idx;
+
+    const static size_t SCORE_RANGE = 256;
+
+    void updateScoreBins(CounterResult *result, size_t elementCount);
+
+    // Sums scoreSizes from bin SCORE_RANGE - 1 down to 1; returns the bin where the sum first reaches maxHitsPerQuery, else 0.
+    static unsigned int computeScoreThreshold(unsigned int * scoreSizes, size_t maxHitsPerQuery) {
+        size_t accumulated = 0;
+        size_t bin = SCORE_RANGE - 1;
+        while (bin > 0) {
+            accumulated += scoreSizes[bin];
+            if (accumulated >= maxHitsPerQuery) { break; }
+            --bin;
+        }
+        return bin;
+    }
+
     // match sequence against the IndexTable
-    size_t match(Sequence *seq, float *pDouble);
+    size_t match(Sequence *seq, float *compositionBias);
 
     // extract result from databaseHits
     template <int TYPE>
@@ -216,30 +220,39 @@ class QueryMatcher {
     // compute double hits
     size_t getDoubleDiagonalMatches();
 
-    float *compositionBias;
+    size_t radixSortByScoreSize(const unsigned int *scoreSizes,
+                                CounterResult *writePos, const unsigned int scoreThreshold,
+                                const CounterResult *results, const size_t resultSize);
 
-    // diagonal scoring active
-    bool diagonalScoring;
-    unsigned int minDiagScoreThr;
-    // size of max diagonalMatcher result objects
-    size_t counterResultSize;
+    std::pair<size_t, unsigned int> rescoreHits(Sequence * querySeq, unsigned int *scoreSizes, CounterResult *results,
+                                                size_t resultSize, UngappedAlignment *align, int lowerBoundScore);
 
-    Indexer idx;
+#define CacheFriendlyOperations(x)  CacheFriendlyOperations<x> * cachedOperation##x
+    CacheFriendlyOperations(2);
+    CacheFriendlyOperations(4);
+    CacheFriendlyOperations(8);
+    CacheFriendlyOperations(16);
+    CacheFriendlyOperations(32);
+    CacheFriendlyOperations(64);
+    CacheFriendlyOperations(128);
+    CacheFriendlyOperations(256);
+    CacheFriendlyOperations(512);
+    CacheFriendlyOperations(1024);
+    CacheFriendlyOperations(2048);
+#undef CacheFriendlyOperations
 
     void initDiagonalMatcher(size_t dbsize, unsigned int maxDbMatches);
 
     void deleteDiagonalMatcher(unsigned int activeCounter);
 
-    size_t mergeElements(CounterResult *foundDiagonals, size_t hitCounter);
+    // find duplicates in the diagonal bins
+    size_t findDuplicates(IndexEntryLocal **hitsByIndex, CounterResult *output,
+                          size_t outputSize, unsigned short indexFrom, unsigned short indexTo, bool computeTotalScore);
 
-    size_t keepMaxScoreElementOnly(CounterResult *foundDiagonals, size_t resultSize);
 
-    size_t radixSortByScoreSize(const unsigned int *scoreSizes,
-                              CounterResult *writePos, const unsigned int scoreThreshold,
-                              const CounterResult *results, const size_t resultSize);
+    size_t mergeElements(CounterResult *foundDiagonals, size_t hitCounter);
 
-    std::pair<size_t, unsigned int> rescoreHits(Sequence * querySeq, unsigned int *scoreSizes, CounterResult *results,
-                                                size_t resultSize, UngappedAlignment *align, int lowerBoundScore);
+    size_t keepMaxScoreElementOnly(CounterResult *foundDiagonals, size_t resultSize);
 };
 
-#endif //MMSEQS_QUERYTEMPLATEMATCHEREXACTMATCH_H
\ No newline at end of file
+#endif //MMSEQS_QUERYTEMPLATEMATCHEREXACTMATCH_H
diff --git a/src/prefiltering/ReducedMatrix.cpp b/src/prefiltering/ReducedMatrix.cpp
index 940701e..fee2470 100644
--- a/src/prefiltering/ReducedMatrix.cpp
+++ b/src/prefiltering/ReducedMatrix.cpp
@@ -3,7 +3,7 @@
 #include "Util.h"
 
 ReducedMatrix::ReducedMatrix(double **probMatrix, float ** rMatrix,
-                             int* aa2int, char* int2aa,
+                             unsigned char* aa2num, char* num2aa,
                              size_t orgAlphabetSize,
                              size_t reducedAlphabetSize, float bitFactor){
     if(reducedAlphabetSize >= orgAlphabetSize) {
@@ -13,10 +13,10 @@ ReducedMatrix::ReducedMatrix(double **probMatrix, float ** rMatrix,
     initMatrixMemory(orgAlphabetSize);
     // swap the matrix and alphabet mappings
     this->origAlphabetSize = orgAlphabetSize;
-    this->orig_aa2int = new int[UCHAR_MAX];
-    memcpy(orig_aa2int, aa2int, sizeof(int) * UCHAR_MAX);
-    this->orig_int2aa = new char[orgAlphabetSize];
-    memcpy(orig_int2aa, int2aa, sizeof(char) * orgAlphabetSize);
+    this->orig_aa2num = new unsigned char[UCHAR_MAX];
+    memcpy(orig_aa2num, aa2num, sizeof(unsigned char) * UCHAR_MAX);
+    this->orig_num2aa = new char[orgAlphabetSize];
+    memcpy(orig_num2aa, num2aa, sizeof(char) * orgAlphabetSize);
 
     for(size_t i = 0; i < this->origAlphabetSize; i++) {
         for (size_t j = 0; j < this->origAlphabetSize; j++) {
@@ -25,10 +25,10 @@ ReducedMatrix::ReducedMatrix(double **probMatrix, float ** rMatrix,
     }
     // initialize new matrices and alphabet mappings
     this->alphabetSize = reducedAlphabetSize;
-    for (size_t i = 0; i < UCHAR_MAX; ++i) { this->aa2int[i] = orig_aa2int[i]; };
+    for (size_t i = 0; i < UCHAR_MAX; ++i) { this->aa2num[i] = orig_aa2num[i]; };
     for (size_t i = 0; i < origAlphabetSize; ++i){
-        this->int2aa[i] = orig_int2aa[i];
-        reducedAlphabet.push_back(this->int2aa[i]);
+        this->num2aa[i] = orig_num2aa[i];
+        reducedAlphabet.push_back(this->num2aa[i]);
     }
 
     double ** subMatrix_tmp=new double*[origAlphabetSize-1];
@@ -68,17 +68,17 @@ ReducedMatrix::ReducedMatrix(double **probMatrix, float ** rMatrix,
         // Debug(Debug::INFO)  << lost_aa  << " -> " << reduced_aa << "\n";
         reducedAlphabet.erase(reducedAlphabet.begin() + lost_index);
 
-        int reduced_int=this->orig_aa2int[(int)reduced_aa];
-        int lost_int   =this->aa2int[(int)lost_aa];
+        int reduced_int= this->orig_aa2num[(int)reduced_aa];
+        int lost_int   = this->aa2num[static_cast<int>(lost_aa)];
 
         for (size_t i = 0; i < this->origAlphabetSize; i++) {
-            if(this->int2aa[i]==lost_aa){
-                this->int2aa[i]=reduced_aa;
+            if(this->num2aa[i]==lost_aa){
+                this->num2aa[i]=reduced_aa;
             }
         }
         for (int i =0; i < UCHAR_MAX; i++) {
-            if (this->aa2int[i]==lost_int) {
-                this->aa2int[i] = (int) reduced_int;
+            if (this->aa2num[i]==lost_int) {
+                this->aa2num[i] = (int) reduced_int;
             }
         }
         copyMatrix(probMatrix_new, this->probMatrix, origAlphabetSize-1);
@@ -86,24 +86,24 @@ ReducedMatrix::ReducedMatrix(double **probMatrix, float ** rMatrix,
 
     // map big index to new small index
     Debug(Debug::INFO) << "Reduced amino acid alphabet: ";
-    int* aa2int_new = new int[UCHAR_MAX+1];
+    unsigned char* aa2num_new = new unsigned char[UCHAR_MAX+1];
     for (int i = 0; i <= UCHAR_MAX; ++i){
-        aa2int_new[i] = -1;
+        aa2num_new[i] = -1;
     }
-    char* int2aa_new = new char[origAlphabetSize];
+    char* num2aa_new = new char[origAlphabetSize];
     for(size_t i = 0; i<reducedAlphabet.size(); i++){
         const char representative_aa = reducedAlphabet.at(i);
         Debug(Debug::INFO) << "(" << representative_aa;
         for(size_t j =0; j < UCHAR_MAX; j++){
-            if(this->aa2int[(int)j] == this->aa2int[(int)representative_aa]){
+            if(this->aa2num[static_cast<int>(j)] == this->aa2num[static_cast<int>(representative_aa)]){
                 if(j>=65 && j <=90 && static_cast<char>(j) != representative_aa && representative_aa != 'X'){ // only upper case letters
                     Debug(Debug::INFO) << " " << static_cast<char>(j);
                 }
-                aa2int_new[j] = i;
+                aa2num_new[j] = i;
             }
         }
         Debug(Debug::INFO) << ") ";
-        int2aa_new[i] = representative_aa;
+        num2aa_new[i] = representative_aa;
     }
     Debug(Debug::INFO) << "\n";
 
@@ -112,14 +112,14 @@ ReducedMatrix::ReducedMatrix(double **probMatrix, float ** rMatrix,
 
     // compute X background
     for (int i = 0; i < alphabetSize - 1; i++) {
-        pBack[i] = pBack[i] * (1.0 - pBack[aa2int[(int)'X']]);
+        pBack[i] = pBack[i] * (1.0 - pBack[aa2num[(int)'X']]);
     }
 
     double * origpBack=new double[origAlphabetSize];
     computeBackground(probMatrix, origpBack, origAlphabetSize, true);
     // copy old X state
     for (int i = 0; i < this->alphabetSize; i++) {
-        int oldIndex = aa2int[(int)int2aa_new[i]];
+        int oldIndex = aa2num[(int)num2aa_new[i]];
         double Pab = probMatrix[oldIndex][origAlphabetSize-1] / ( origpBack[oldIndex] * origpBack[origAlphabetSize-1]);
         probMatrix_new[alphabetSize-1][i] = Pab * pBack[i] * pBack[alphabetSize-1];
         probMatrix_new[i][alphabetSize-1] = Pab * pBack[alphabetSize-1] * pBack[i];
@@ -128,11 +128,11 @@ ReducedMatrix::ReducedMatrix(double **probMatrix, float ** rMatrix,
     generateSubMatrix(probMatrix_new, rMatrix, this->subMatrix, alphabetSize, true, bitFactor, 0.0);
 
 
-    delete[] this->int2aa;
-    delete[] this->aa2int;
+    delete[] this->num2aa;
+    delete[] this->aa2num;
 
-    this->int2aa = int2aa_new;
-    this->aa2int = aa2int_new;
+    this->num2aa = num2aa_new;
+    this->aa2num = aa2num_new;
 
 
     setupLetterMapping();
@@ -145,8 +145,8 @@ ReducedMatrix::ReducedMatrix(double **probMatrix, float ** rMatrix,
 }
 
 ReducedMatrix::~ReducedMatrix(){
-    delete[] orig_int2aa;
-    delete[] orig_aa2int;
+    delete[] orig_num2aa;
+    delete[] orig_aa2num;
 }
 
 void ReducedMatrix::copyMatrix(double ** input,double ** output, size_t size){
diff --git a/src/prefiltering/ReducedMatrix.h b/src/prefiltering/ReducedMatrix.h
index 2ae37ed..f8d1dfd 100644
--- a/src/prefiltering/ReducedMatrix.h
+++ b/src/prefiltering/ReducedMatrix.h
@@ -1,16 +1,12 @@
 #ifndef ReducedMatrix_H
 #define ReducedMatrix_H
-#include <fstream>
-#include <string>
-#include <vector>
-#include <climits> 
 #include "BaseMatrix.h"
 #include "Debug.h"
 
 class ReducedMatrix : public BaseMatrix {
     public:
         ReducedMatrix(double **probMatrix, float ** rMatrix,
-                      int* aa2int, char* int2aa, size_t orgAlphabetSize,
+                      unsigned char* aa2num, char* num2aa, size_t orgAlphabetSize,
                       size_t reducedAlphabetSize, float bitFactor);
         virtual ~ReducedMatrix();
 
@@ -39,19 +35,19 @@ class ReducedMatrix : public BaseMatrix {
                                 case 'W':
                                 case 'Y':
                                 case 'X':
-                                        this->aa2int[static_cast<int>(letter)] = this->aa2int[static_cast<int>(upperLetter)];
+                                        this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>(upperLetter)];
                                 break;
                                 case 'J':
-                                        this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'L'];
+                                        this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>('L')];
                                 break;
                                 case 'U':
                                 case 'O':
-                                        this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'X'];
+                                        this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>('X')];
                                 break;
-                                case 'Z': this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'E']; break;
-                                case 'B': this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'D']; break;
+                                case 'Z': this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>('E')]; break;
+                                case 'B': this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>('D')]; break;
                                 default:
-                                        this->aa2int[static_cast<int>(letter)] = this->aa2int[(int)'X'];
+                                        this->aa2num[static_cast<int>(letter)] = this->aa2num[static_cast<int>('X')];
                                 break;
                         }
                 }
@@ -59,14 +55,14 @@ class ReducedMatrix : public BaseMatrix {
     private:
 
         /*contains the original matrix before the alphabet reduction*/
-        int*   orig_aa2int;
-        char*  orig_int2aa;
+        unsigned char* orig_aa2num;
+        char*  orig_num2aa;
         /* size of the original alphabet*/
         size_t origAlphabetSize;
 
-        // base class aa2int and int2aa mappings contain now:
-        // aa2int: mapping aa (orig. alphabet) -> int code of the representative amino acid
-        // int2aa: mapping int code (orig. alphabet) -> the representative amino acid char
+        // base class aa2num and num2aa mappings contain now:
+        // aa2num: mapping aa (orig. alphabet) -> int code of the representative amino acid
+        // num2aa: mapping int code (orig. alphabet) -> the representative amino acid char
 
         // reducedAlphabet contains only the "representative" amino acids
         std::vector<char> reducedAlphabet;
diff --git a/src/prefiltering/SequenceLookup.cpp b/src/prefiltering/SequenceLookup.cpp
index 8fca556..09756bf 100644
--- a/src/prefiltering/SequenceLookup.cpp
+++ b/src/prefiltering/SequenceLookup.cpp
@@ -29,16 +29,13 @@ SequenceLookup::~SequenceLookup() {
     }
 }
 
-void SequenceLookup::addSequence(int *seq, int L, size_t index, size_t offset){
+void SequenceLookup::addSequence(unsigned char *seq, int L, size_t index, size_t offset){ // store seq (L entries) as sequence number index at position offset
     offsets[index] = offset;
-    for(int pos = 0; pos < L; pos++){
-        unsigned char aa = seq[pos];
-        data[offset + pos] = aa;
-    }
+    memcpy(&data[offset], seq, L); // copies L bytes from seq into data starting at offset
 }
 
 void SequenceLookup::addSequence(Sequence *seq) {
-    addSequence(seq->int_sequence, seq->L, currentIndex, currentOffset);
+    addSequence(seq->numSequence, seq->L, currentIndex, currentOffset);
     currentIndex = currentIndex + 1;
     currentOffset = currentOffset + seq->L;
 }
diff --git a/src/prefiltering/SequenceLookup.h b/src/prefiltering/SequenceLookup.h
index b54079d..b789286 100644
--- a/src/prefiltering/SequenceLookup.h
+++ b/src/prefiltering/SequenceLookup.h
@@ -16,7 +16,7 @@ class SequenceLookup {
     ~SequenceLookup();
 
     // add sequence at offset
-    void addSequence(int *seq, int L, size_t index, size_t offset);
+    void addSequence(unsigned char *seq, int L, size_t index, size_t offset);
 
     // add sequence to index
     void addSequence(Sequence * seq);
diff --git a/src/prefiltering/UngappedAlignment.cpp b/src/prefiltering/UngappedAlignment.cpp
index 4123c0f..dbfea89 100644
--- a/src/prefiltering/UngappedAlignment.cpp
+++ b/src/prefiltering/UngappedAlignment.cpp
@@ -96,16 +96,8 @@ simd_int UngappedAlignment::vectorDiagonalScoring(const char *profile,
         // _mm_shuffle_epi8
         // for i ... 16
         //   score01[i] = score_matrix_vec01[template01[i]%16]
-#ifdef NEON
-        __m128i score01 =vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(score_matrix_vec01),vreinterpretq_u8_m128i(template01)));
-#else
         __m128i score01 =_mm_shuffle_epi8(score_matrix_vec01,template01);
-#endif
-#ifdef NEON
-        __m128i score16 =vreinterpretq_m128i_u8(vqtbl1q_u8(vreinterpretq_u8_m128i(score_matrix_vec16),vreinterpretq_u8_m128i(template01)));
-#else
         __m128i score16 =_mm_shuffle_epi8(score_matrix_vec16,template01);
-#endif
         // t[i] < 16 => 0 - 15
         // example: template01: 02 15 12 18 < 16 16 16 16 => FF FF FF 00
         __m128i lookup_mask01 = _mm_cmplt_epi8(template01, sixten);
@@ -356,7 +348,7 @@ short UngappedAlignment::createProfile(Sequence *seq,
         }
     }else{
         for (int pos = 0; pos < seq->L; pos++) {
-            unsigned int aaIdx = seq->int_sequence[pos];
+            unsigned int aaIdx = seq->numSequence[pos];
             for (int i = 0; i < subMatrix->alphabetSize; i++) {
                 queryProfile[pos * PROFILESIZE + i] = (subMat[aaIdx][i] + aaCorrectionScore[pos] + bias);
             }
diff --git a/src/prefiltering/ungappedprefilter.cpp b/src/prefiltering/ungappedprefilter.cpp
index 3240c32..616fe0f 100644
--- a/src/prefiltering/ungappedprefilter.cpp
+++ b/src/prefiltering/ungappedprefilter.cpp
@@ -102,9 +102,9 @@ int doRescorealldiagonal(Parameters &par, DBReader<unsigned int> &qdbr, DBWriter
 //            qSeq.printProfileStatePSSM();
             if(Parameters::isEqualDbtype(qSeq.getSeqType(), Parameters::DBTYPE_HMM_PROFILE) ||
                Parameters::isEqualDbtype(qSeq.getSeqType(), Parameters::DBTYPE_PROFILE_STATE_PROFILE)){
-                aligner.ssw_init(&qSeq, qSeq.getAlignmentProfile(), subMat, subMat->alphabetSize, 0);
+                aligner.ssw_init(&qSeq, qSeq.getAlignmentProfile(), subMat, 0);
             }else{
-                aligner.ssw_init(&qSeq, tinySubMat, subMat, subMat->alphabetSize, 0);
+                aligner.ssw_init(&qSeq, tinySubMat, subMat, 0);
             }
 
             for (size_t tId = 0; tId < tdbr->getSize(); tId++) {
@@ -120,7 +120,7 @@ int doRescorealldiagonal(Parameters &par, DBReader<unsigned int> &qdbr, DBWriter
                     continue;
                 }
 
-                int score = aligner.ungapped_alignment(tSeq.int_sequence, tSeq.L);
+                int score = aligner.ungapped_alignment(tSeq.numSequence, tSeq.L);
                 bool hasDiagScore = (score > par.minDiagScoreThr);
                 double evalue = evaluer->computeEvalue(score, qSeq.L);
                 bool hasEvalue = (evalue <= par.evalThr);
diff --git a/src/taxonomy/CMakeLists.txt b/src/taxonomy/CMakeLists.txt
index 68d970e..cd0a1f7 100644
--- a/src/taxonomy/CMakeLists.txt
+++ b/src/taxonomy/CMakeLists.txt
@@ -9,6 +9,8 @@ set(taxonomy_source_files
         taxonomy/addtaxonomy.cpp
         taxonomy/NcbiTaxonomy.cpp
         taxonomy/filtertaxdb.cpp
+        taxonomy/filtertaxseqdb.cpp
+        taxonomy/aggregatetax.cpp
         taxonomy/createtaxdb.cpp
         taxonomy/taxonomyreport.cpp
         taxonomy/TaxonomyExpression.h
diff --git a/src/taxonomy/NcbiTaxonomy.cpp b/src/taxonomy/NcbiTaxonomy.cpp
index 8cda330..63362c1 100644
--- a/src/taxonomy/NcbiTaxonomy.cpp
+++ b/src/taxonomy/NcbiTaxonomy.cpp
@@ -34,8 +34,6 @@ void deleteMatrix(int** M, size_t maxNodes) {
 
 NcbiTaxonomy::NcbiTaxonomy(const std::string &namesFile,  const std::string &nodesFile,
                            const std::string &mergedFile) {
-    InitLevels();
-
     loadNodes(nodesFile);
     loadMerged(mergedFile);
     loadNames(namesFile);
@@ -68,46 +66,6 @@ NcbiTaxonomy::~NcbiTaxonomy() {
     deleteMatrix(M, maxNodes);
 }
 
-void NcbiTaxonomy::InitLevels() {
-    sortedLevels["forma"] = 1;
-    sortedLevels["varietas"] = 2;
-    sortedLevels["subspecies"] = 3;
-    sortedLevels["species"] = 4;
-    sortedLevels["species subgroup"] = 5;
-    sortedLevels["species group"] = 6;
-    sortedLevels["subgenus"] = 7;
-    sortedLevels["genus"] = 8;
-    sortedLevels["subtribe"] = 9;
-    sortedLevels["tribe"] = 10;
-    sortedLevels["subfamily"] = 11;
-    sortedLevels["family"] = 12;
-    sortedLevels["superfamily"] = 13;
-    sortedLevels["parvorder"] = 14;
-    sortedLevels["infraorder"] = 15;
-    sortedLevels["suborder"] = 16;
-    sortedLevels["order"] = 17;
-    sortedLevels["superorder"] = 18;
-    sortedLevels["infraclass"] = 19;
-    sortedLevels["subclass"] = 20;
-    sortedLevels["class"] = 21;
-    sortedLevels["superclass"] = 22;
-    sortedLevels["subphylum"] = 23;
-    sortedLevels["phylum"] = 24;
-    sortedLevels["superphylum"] = 25;
-    sortedLevels["subkingdom"] = 26;
-    sortedLevels["kingdom"] = 27;
-    sortedLevels["superkingdom"] = 28;
-
-    shortRank["species"] = 's';
-    shortRank["genus"] = 'g';
-    shortRank["family"] = 'f';
-    shortRank["order"] = 'o';
-    shortRank["class"] = 'c';
-    shortRank["phylum"] = 'p';
-    shortRank["kingdom"] = 'k';
-    shortRank["superkingdom"] = 'd';
-}
-
 std::vector<std::string> splitByDelimiter(const std::string &s, const std::string &delimiter, int maxCol) {
     std::vector<std::string> result;
     size_t prev = 0, pos = 0;
@@ -331,11 +289,7 @@ std::vector<std::string> NcbiTaxonomy::AtRanks(TaxonNode const *node, const std:
     std::vector<std::string> result;
     std::map<std::string, std::string> allRanks = AllRanks(node);
     // map does not include "no rank" nor "no_rank"
-    int baseRankIndex = -1;
-    if (sortedLevels.find(node->rank) != sortedLevels.end()) {
-        // found rank in map:
-        baseRankIndex = sortedLevels.at(node->rank);
-    }
+    int baseRankIndex = findRankIndex(node->rank);
     std::string baseRank = "uc_" + node->name;
     for (std::vector<std::string>::const_iterator it = levels.begin(); it != levels.end(); ++it) {
         std::map<std::string, std::string>::iterator jt = allRanks.find(*it);
@@ -345,7 +299,7 @@ std::vector<std::string> NcbiTaxonomy::AtRanks(TaxonNode const *node, const std:
         }
 
         // If not ... 2 possible causes: i) too low level ("uc_")
-        if (sortedLevels.at(*it) < baseRankIndex) {
+        if (NcbiRanks.at(*it) < baseRankIndex) {
             result.emplace_back(baseRank);
             continue;
         }
@@ -356,13 +310,31 @@ std::vector<std::string> NcbiTaxonomy::AtRanks(TaxonNode const *node, const std:
     return result;
 }
 
-char NcbiTaxonomy::getShortRank(const std::string& rank) const {
-    std::map<std::string, char>::const_iterator it = shortRank.find(rank);
-    if (it == shortRank.end()) {
-        return '-';
-    } else {
+std::vector<std::string> NcbiTaxonomy::parseRanks(const std::string& ranks) { // split ranks on "," and validate every entry
+    std::vector<std::string> temp = Util::split(ranks, ",");
+    for (size_t i = 0; i < temp.size(); ++i) {
+        if (findRankIndex(temp[i]) == -1) { // -1: name is not a key of NcbiRanks
+            Debug(Debug::ERROR) << "Invalid taxonomic rank " << temp[i] << " given\n";
+            EXIT(EXIT_FAILURE);
+        }
+    }
+    return temp;
+}
+
+int NcbiTaxonomy::findRankIndex(const std::string& rank) { // value stored in NcbiRanks for rank, or -1
+    const std::map<std::string, int>::const_iterator match = NcbiRanks.find(rank);
+    if (match == NcbiRanks.end()) {
+        return -1; // rank name is not a key of NcbiRanks
+    }
+    return match->second;
+}
+
+char NcbiTaxonomy::findShortRank(const std::string& rank) { // character stored in NcbiShortRanks for rank, or '-'
+    std::map<std::string, char>::const_iterator it;
+    if ((it = NcbiShortRanks.find(rank)) != NcbiShortRanks.end()) {
         return it->second;
     }
+    return '-'; // rank has no entry in NcbiShortRanks
 }
 
 std::string NcbiTaxonomy::taxLineage(TaxonNode const *node) {
@@ -375,7 +347,7 @@ std::string NcbiTaxonomy::taxLineage(TaxonNode const *node) {
     } while (node->parentTaxId != node->taxId);
 
     for (int i = taxLineageVec.size() - 1; i >= 0; --i) {
-        taxLineage += getShortRank(taxLineageVec[i]->rank);
+        taxLineage += findShortRank(taxLineageVec[i]->rank);
         taxLineage += '_';
         taxLineage += taxLineageVec[i]->name;
         if (i > 0) {
diff --git a/src/taxonomy/NcbiTaxonomy.h b/src/taxonomy/NcbiTaxonomy.h
index 859b8a6..1dc12a9 100644
--- a/src/taxonomy/NcbiTaxonomy.h
+++ b/src/taxonomy/NcbiTaxonomy.h
@@ -30,6 +30,45 @@ struct TaxonCounts {
     std::vector<TaxID> children; // list of children
 };
 
+
+static const std::map<std::string, int> NcbiRanks = {{ "forma", 1 }, // rank name -> ordinal; AtRanks treats a smaller ordinal as a lower level
+                                                     { "varietas", 2 },
+                                                     { "subspecies", 3 },
+                                                     { "species", 4 },
+                                                     { "species subgroup", 5 },
+                                                     { "species group", 6 },
+                                                     { "subgenus", 7 },
+                                                     { "genus", 8 },
+                                                     { "subtribe", 9 },
+                                                     { "tribe", 10 },
+                                                     { "subfamily", 11 },
+                                                     { "family", 12 },
+                                                     { "superfamily", 13 },
+                                                     { "parvorder", 14 },
+                                                     { "infraorder", 15 },
+                                                     { "suborder", 16 },
+                                                     { "order", 17 },
+                                                     { "superorder", 18 },
+                                                     { "infraclass", 19 },
+                                                     { "subclass", 20 },
+                                                     { "class", 21 },
+                                                     { "superclass", 22 },
+                                                     { "subphylum", 23 },
+                                                     { "phylum", 24 },
+                                                     { "superphylum", 25 },
+                                                     { "subkingdom", 26 },
+                                                     { "kingdom", 27 },
+                                                     { "superkingdom", 28 }};
+
+static const std::map<std::string, char> NcbiShortRanks = {{ "species", 's' }, // rank name -> one-character code looked up by findShortRank
+                                                           { "genus", 'g' },
+                                                           { "family", 'f' },
+                                                           { "order", 'o' },
+                                                           { "class", 'c' },
+                                                           { "phylum", 'p' },
+                                                           { "kingdom", 'k' },
+                                                           { "superkingdom", 'd' }};
+
 class NcbiTaxonomy {
 public:
     NcbiTaxonomy(const std::string &namesFile,  const std::string &nodesFile,
@@ -42,6 +81,10 @@ class NcbiTaxonomy {
     std::map<std::string, std::string> AllRanks(TaxonNode const *node) const;
     std::string taxLineage(TaxonNode const *node);
 
+    static std::vector<std::string> parseRanks(const std::string& ranks);
+    static int findRankIndex(const std::string& rank);
+    static char findShortRank(const std::string& rank);
+
     bool IsAncestor(TaxID ancestor, TaxID child);
     TaxonNode const* taxonNode(TaxID taxonId, bool fail = true) const;
     //std::unordered_map<TaxID, unsigned int> getCladeCounts(std::unordered_map<TaxID, unsigned int>& taxonCounts, TaxID taxon = 1) const;
@@ -49,7 +92,6 @@ class NcbiTaxonomy {
 
     static NcbiTaxonomy * openTaxonomy(std::string & database);
 private:
-    void InitLevels();
     size_t loadNodes(const std::string &nodesFile);
     size_t loadMerged(const std::string &mergedFile);
     void loadNames(const std::string &namesFile);
@@ -60,7 +102,6 @@ class NcbiTaxonomy {
 
     int RangeMinimumQuery(int i, int j) const;
     int lcaHelper(int i, int j) const;
-    char getShortRank(const std::string& rank) const;
 
     std::vector<TaxonNode> taxonNodes;
     std::vector<int> D; // maps from taxID to node ID in taxonNodes
@@ -70,9 +111,6 @@ class NcbiTaxonomy {
     int **M;
     size_t maxNodes;
 
-    std::map<std::string, int> sortedLevels;
-    std::map<std::string, char> shortRank;
-
 };
 
 #endif
diff --git a/src/taxonomy/TaxonomyExpression.h b/src/taxonomy/TaxonomyExpression.h
index 03706e9..c84b4f5 100644
--- a/src/taxonomy/TaxonomyExpression.h
+++ b/src/taxonomy/TaxonomyExpression.h
@@ -7,133 +7,74 @@
 #include "NcbiTaxonomy.h"
 #include "Debug.h"
 #include <vector>
+#include <ctype.h>
+#include "ExpressionParser.h"
 
+// Not thread-safe: isAncestor() stores the queried taxon in member state, so use one instance per thread
 class TaxonomyExpression{
-    struct TaxonomyTerm{
-        unsigned int taxId;
-        bool shouldBeAncestor;
-        bool orTerm;
-        TaxonomyTerm( unsigned int taxId, bool negaitve, bool orTerm)
-                :taxId(taxId), shouldBeAncestor(negaitve), orTerm(orTerm){}
+
+private:
+    struct TaxContext {
+        NcbiTaxonomy* t;
+        TaxID taxId;
     };
-    std::vector<std::vector<TaxonomyTerm>> taxTerms;
+    TaxContext tc;
+    ExpressionParser * parser;
+    std::vector<te_variable> vars;
 
 public:
-    TaxonomyExpression(std::string expression){
-        bool inBracket = false;
-        bool shouldBeAncestor = true;
-        bool isOr = false;
-        bool isAnd = false;
-        unsigned int taxId;
-        std::vector<TaxonomyTerm> term;
-        for(size_t pos = 0; pos < expression.size(); pos++){
-            switch(expression[pos]) {
-                case '(':
-                    if(inBracket == true){
-                        Debug(Debug::ERROR) << "Error in expression " << expression << ". It is not allowed to open another bracket within a bracket terms\n";
-                        EXIT(EXIT_FAILURE);
-                    }
-                    inBracket = true;
-                    break;
-                case ')':
-                    if(inBracket == false){
-                        Debug(Debug::ERROR) << "Error in expression " << expression << " closes a bracket without opening it\n";
-                        EXIT(EXIT_FAILURE);
-                    }
-                    inBracket = false;
-                    break;
-                case '!':
-                    shouldBeAncestor = false;
-                    break;
-                case '&':
-                    if(inBracket == false){
-                        Debug(Debug::ERROR) << "It is not supported to use & without a bracket\n";
-                        EXIT(EXIT_FAILURE);
-                    }
-                    if(isOr == true){
-                        Debug(Debug::ERROR) << "It is not supported to mix & and | \n";
-                        EXIT(EXIT_FAILURE);
-                    }
-                    shouldBeAncestor = true;
-                    isAnd = true;
-                    break;
-                case '|':
-                    if(inBracket == false){
-                        Debug(Debug::ERROR) << "It is not supported to use | without a bracket\n";
-                        EXIT(EXIT_FAILURE);
-                    }
-                    if(isAnd == true){
-                        Debug(Debug::ERROR) << "It is not supported to mix & and | \n";
-                        EXIT(EXIT_FAILURE);
-                    }
-                    isOr = true;
-                    break;
-                case ',':
-                    shouldBeAncestor = true;
-                    isOr = false;
-                    isAnd = false;
-                    if(inBracket == false){
-                        taxTerms.push_back(term);
-                        term.clear();
-                    }else{
-                        Debug(Debug::ERROR) << "Error in expression " << expression << ". It is not allowed to use , within bracket terms only use &\n";
-                        EXIT(EXIT_FAILURE);
-                    }
-                    break;
-                case '0':
-                case '1':
-                case '2':
-                case '3':
-                case '4':
-                case '5':
-                case '6':
-                case '7':
-                case '8':
-                case '9':
-                    taxId = Util::fast_atoi<unsigned int>(&expression[pos]);
-                    while(pos < expression.size() && expression[pos] >= '0' && expression[pos] <= '9'){
-                        pos++;
-                    }
-                    pos--;
-                    term.emplace_back(taxId, shouldBeAncestor, isOr);
-                    shouldBeAncestor = true;
-                    break;
-                default:
-                    Debug(Debug::ERROR) << "Wrong character in expression: " << expression[pos] << "\n";
-                    EXIT(EXIT_FAILURE);
-                    break;
-            }
-        }
-        taxTerms.push_back(term);
+
+    // Closure bound to the variable "a": 1.0 if taxon `a` is an ancestor of the taxon stored in the context, else 0.0.
+    static double acst(void* context, double a) {
+        TaxContext* const ctx = static_cast<TaxContext*>(context);
+        return ctx->t->IsAncestor(static_cast<TaxID>(a), ctx->taxId) ? 1.0 : 0.0;
     }
 
-    bool isAncestor(NcbiTaxonomy &taxonomy, std::vector<TaxonomyTerm> &termToCheck, unsigned int taxId){
-        size_t ancestorCnt = 0;
-        for (size_t j = 0; j < termToCheck.size(); ++j) {
-                ancestorCnt += (taxonomy.IsAncestor(termToCheck[j].taxId, taxId) == termToCheck[j].shouldBeAncestor);
-        }
-        if(termToCheck.back().orTerm){
-            return (ancestorCnt >= 1);
-        }else{
-            return (ancestorCnt == termToCheck.size());
+    // Rewrites every run of digits N in `expression` to a(N) (9606 becomes a(9606)), leaving all other
+    // characters as they are, and parses the result with `a` bound to acst().
+    TaxonomyExpression(std::string expression, NcbiTaxonomy &taxonomy){
+        std::string bracketExpression;
+        bool inNumber = false;
+        for(size_t i = 0; i< expression.size(); i++){
+            // isdigit() is undefined for negative char values, so go through unsigned char
+            const bool digit = isdigit(static_cast<unsigned char>(expression[i])) != 0;
+            if(digit == true && inNumber == false){
+                bracketExpression.append("a(");
+            }else if(digit == false && inNumber == true){
+                bracketExpression.append(")");
+            }
+            bracketExpression.push_back(expression[i]);
+            inNumber = digit;
         }
-    }
-    // this function returns the index of the term that fulfils the criteria
-    // -1 means no term fulfils the criteria
-    int isAncestorOf(NcbiTaxonomy &taxonomy, unsigned int taxId){
-        int index = -1;
-        bool ancestor = false;
-        for (size_t j = 0; j < taxTerms.size() && !ancestor; ++j) {
-            ancestor |= isAncestor(taxonomy, taxTerms[j], taxId);
-            index = (ancestor) ? j : index;
+        // close a number that runs up to the end of the expression
+        if(inNumber == true){
+            bracketExpression.append(")");
         }
-        return index;
+        tc.t = &taxonomy;
+        tc.taxId = 0; // placeholder; isAncestor() sets the real taxon before each evaluation
+        te_variable var;
+        var.name = "a";
+        // GCC 4.8 does not like casting functions to void*
+        // GCC > 4.8 are fine with this
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+        var.address = (const void *) &acst;
+#pragma GCC diagnostic pop
+        var.type = TE_CLOSURE1;
+        // acst() receives &tc as its context, so this object has to stay at the same address
+        var.context = (void *) &tc;
+        vars.push_back(var);
+        parser=new ExpressionParser(bracketExpression.c_str(), vars);
     }
 
-
-    std::vector<std::vector<TaxonomyTerm>> getTaxTerms(){
-        return taxTerms;
+    TaxonomyExpression(const TaxonomyExpression&) = delete; // sole owner of `parser`; a copy would delete it twice
+    TaxonomyExpression& operator=(const TaxonomyExpression&) = delete;
+    ~TaxonomyExpression(){ delete parser; }
-    }
 
+    // Evaluates the parsed expression for `taxId`; true when the result is non-zero.
+    bool isAncestor(TaxID taxId){
+        tc.taxId = taxId; // read by acst() through the closure context
+        return (parser->evaluate() != 0);
+    }
 };
 #endif //MMSEQS_TAXONOMYEXPRESSION_H
diff --git a/src/taxonomy/addtaxonomy.cpp b/src/taxonomy/addtaxonomy.cpp
index 3cc01b1..828535e 100644
--- a/src/taxonomy/addtaxonomy.cpp
+++ b/src/taxonomy/addtaxonomy.cpp
@@ -11,48 +11,50 @@
 #endif
 
 
-static bool compareToFirstInt(const std::pair<unsigned int, unsigned int>& lhs, const std::pair<unsigned int, unsigned int>&  rhs){
+static bool compareToFirstInt(const std::pair<unsigned int, unsigned int> &lhs, const std::pair<unsigned int, unsigned int> &rhs) { // '<=' makes std::upper_bound stop at the first key >= the searched key; NOTE(review): not a strict weak ordering for std::stable_sort
     return (lhs.first <= rhs.first);
 }
 
-int addtaxonomy(int argc, const char **argv, const Command& command) {
-    Parameters& par = Parameters::getInstance();
+int addtaxonomy(int argc, const char **argv, const Command &command) {
+    Parameters &par = Parameters::getInstance();
     par.parseParameters(argc, argv, command, true, 0, 0);
 
-    NcbiTaxonomy * t = NcbiTaxonomy::openTaxonomy(par.db1);
-
-    std::vector< std::pair<unsigned int, unsigned int> > mapping;
-    if(FileUtil::fileExists(std::string(par.db1 + "_mapping").c_str()) == false){
-        Debug(Debug::ERROR) << par.db1 + "_mapping" << " does not exist. Please create the taxonomy mapping!\n";
+    std::vector<std::pair<unsigned int, unsigned int>> mapping;
+    if (FileUtil::fileExists((par.db1 + "_mapping").c_str()) == false) {
+        Debug(Debug::ERROR) << par.db1 << "_mapping does not exist. Run createtaxdb to create taxonomy mapping.\n";
         EXIT(EXIT_FAILURE);
     }
-    bool isSorted = Util::readMapping( par.db1 + "_mapping", mapping);
-    if(isSorted == false){
+    const bool isSorted = Util::readMapping(par.db1 + "_mapping", mapping);
+    if (isSorted == false) {
         std::stable_sort(mapping.begin(), mapping.end(), compareToFirstInt);
     }
-    std::vector<std::string> ranks = Util::split(par.lcaRanks, ":");
+    if (mapping.size() == 0) {
+        Debug(Debug::ERROR) << par.db1 << "_mapping is empty. Rerun createtaxdb to recreate taxonomy mapping.\n";
+        EXIT(EXIT_FAILURE);
+    }
+    NcbiTaxonomy *t = NcbiTaxonomy::openTaxonomy(par.db1);
+    std::vector<std::string> ranks = NcbiTaxonomy::parseRanks(par.lcaRanks);
 
-    DBReader<unsigned int> reader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
+    DBReader<unsigned int> reader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
     reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
 
     DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, reader.getDbtype());
     writer.open();
 
-    Debug(Debug::INFO) << "Add taxonomy information \n";
-    size_t taxonNotFound=0;
-    Debug::Progress progress(reader.getSize());
+    size_t taxonNotFound = 0;
     size_t deletedNodes = 0;
-    #pragma omp parallel
+    Debug::Progress progress(reader.getSize());
+#pragma omp parallel
     {
         unsigned int thread_idx = 0;
 #ifdef OPENMP
         thread_idx = (unsigned int) omp_get_thread_num();
 #endif
         const char *entry[255];
-        std::string resultData;
-        resultData.reserve(4096);
+        std::string result;
+        result.reserve(4096);
 
-        #pragma omp for schedule(dynamic, 10) reduction (+: deletedNodes, taxonNotFound)
+#pragma omp for schedule(dynamic, 10) reduction (+: deletedNodes, taxonNotFound)
         for (size_t i = 0; i < reader.getSize(); ++i) {
             progress.updateProgress();
 
@@ -64,66 +66,62 @@ int addtaxonomy(int argc, const char **argv, const Command& command) {
                 continue;
             }
             std::pair<unsigned int, unsigned int> val;
-            std::vector< std::pair<unsigned int, unsigned int> >::iterator mappingIt;
-            if(par.pickIdFrom == Parameters::EXTRACT_QUERY){
+            std::vector<std::pair<unsigned int, unsigned int> >::iterator mappingIt;
+            if (par.pickIdFrom == Parameters::EXTRACT_QUERY) {
                 val.first = key;
                 mappingIt = std::upper_bound(mapping.begin(), mapping.end(), val, compareToFirstInt);
+                // only query mode has a lookup result at this point; target mode searches per line below
+                if (mappingIt == mapping.end() || mappingIt->first != val.first) {
+                    taxonNotFound++;
+                    continue;
+                }
             }
-
-            std::vector<int> taxa;
             while (*data != '\0') {
                 const size_t columns = Util::getWordsOfLine(data, entry, 255);
                 if (columns == 0) {
-                    Debug(Debug::WARNING) << "Empty entry: " << i << "!";
+                    Debug(Debug::WARNING) << "Empty entry: " << i << "\n";
                     data = Util::skipLine(data);
                     continue;
                 }
-                if(par.pickIdFrom == Parameters::EXTRACT_TARGET){
+                if (par.pickIdFrom == Parameters::EXTRACT_TARGET) {
                     unsigned int id = Util::fast_atoi<unsigned int>(entry[0]);
                     val.first = id;
                     mappingIt = std::upper_bound(mapping.begin(), mapping.end(), val, compareToFirstInt);
                 }
-                if (mappingIt->first != val.first) {
+                if (mappingIt == mapping.end() || mappingIt->first != val.first) {
                     taxonNotFound++;
-//                    Debug(Debug::WARNING) << "No taxon mapping provided for id " << id << "\n";
                     data = Util::skipLine(data);
                     continue;
                 }
                 unsigned int taxon = mappingIt->second;
-                TaxonNode const * node = t->taxonNode(taxon, false);
-                if(node == NULL){
+                TaxonNode const *node = t->taxonNode(taxon, false);
+                if (node == NULL) {
                     deletedNodes++;
                     data = Util::skipLine(data);
                     continue;
                 }
-                char * nextData = Util::skipLine(data);
+                char *nextData = Util::skipLine(data);
                 size_t dataSize = nextData - data;
-                resultData.append(data, dataSize-1);
-                resultData += '\t' + SSTR(node->taxId) + '\t' + node->rank + '\t' + node->name;
+                result.append(data, dataSize - 1);
+                result += '\t' + SSTR(node->taxId) + '\t' + node->rank + '\t' + node->name;
                 if (!ranks.empty()) {
-                    std::string lcaRanks = Util::implode(t->AtRanks(node, ranks), ':');
-                    resultData += '\t' + lcaRanks;
+                    std::string lcaRanks = Util::implode(t->AtRanks(node, ranks), ';');
+                    result += '\t' + lcaRanks;
                 }
                 if (par.showTaxLineage) {
-                    resultData += '\t' + t->taxLineage(node);
-                }
-                resultData += '\n';
-
-                if(resultData.size() == 0){
-                    Debug(Debug::WARNING) << "Taxon record could not be written. Entry: " << i << "\t" << columns << "!\n";
-                    data = Util::skipLine(data);
-                    continue;
+                    result += '\t' + t->taxLineage(node);
                 }
+                result += '\n';
                 data = Util::skipLine(data);
             }
-            writer.writeData(resultData.c_str(), resultData.size(), key, thread_idx);
-            resultData.clear();
+            writer.writeData(result.c_str(), result.size(), key, thread_idx);
+            result.clear();
         }
     }
-    Debug(Debug::INFO) << "\n";
     Debug(Debug::INFO) << "Taxonomy for " << taxonNotFound << " entries not found and " << deletedNodes << " are deleted\n";
-    delete t;
     writer.close();
     reader.close();
+    delete t;
     return EXIT_SUCCESS;
 }
+
diff --git a/src/taxonomy/aggregatetax.cpp b/src/taxonomy/aggregatetax.cpp
new file mode 100644
index 0000000..475fa06
--- /dev/null
+++ b/src/taxonomy/aggregatetax.cpp
@@ -0,0 +1,172 @@
+#include "NcbiTaxonomy.h"
+#include "Parameters.h"
+#include "DBWriter.h"
+#include "FileUtil.h"
+#include "Debug.h"
+#include "Util.h"
+#include <map>
+#include <algorithm>
+
+#ifdef OPENMP
+#include <omp.h>
+#endif
+
+// Chooses the taxon that represents a set, given the taxa assigned to its member sequences.
+// Each assigned taxon (non-zero entry of setTaxa) counts for itself and for all of its ancestors.
+// Among the taxa whose share of the assigned sequences is at least majorityCutoff, the one with the
+// smallest positive NcbiTaxonomy::findRankIndex() is returned; equal ranks are decided by the larger
+// share. Taxa whose rank index is not positive are never returned. Returns 0 if nothing qualifies.
+// Terminates the program if a taxon or one of its ancestors is unknown to the taxonomy.
+TaxID selectTaxForSet (const std::vector<TaxID> &setTaxa, NcbiTaxonomy const *taxonomy, const float majorityCutoff) {
+    // count num occurrences of each ancestor
+    std::map<TaxID,unsigned int> ancTaxIdsCounts;
+    size_t totalAssignedSeqs = 0;
+
+    for (size_t i = 0; i < setTaxa.size(); ++i) {
+        TaxID currTaxId = setTaxa[i];
+        // ignore unassigned sequences
+        if (currTaxId == 0) {
+            continue;
+        }
+        totalAssignedSeqs++;
+        // count the taxon itself, then climb until a node that is its own parent
+        while (true) {
+            TaxonNode const * node = taxonomy->taxonNode(currTaxId, false);
+            if (node == NULL) {
+                // applies to ancestors as well: a broken parent link must not be dereferenced
+                Debug(Debug::ERROR) << "taxonid: " << currTaxId << " does not match a legal taxonomy node.\n";
+                EXIT(EXIT_FAILURE);
+            }
+            ancTaxIdsCounts[currTaxId]++; // operator[] value-initializes a missing count to 0
+            if (node->parentTaxId == currTaxId) {
+                break;
+            }
+            currTaxId = node->parentTaxId;
+        }
+    }
+
+    // select the lowest ancestor that meets the cutoff
+    bool hasSelection = false; // explicit flag instead of a sentinel rank, so no <climits> is needed
+    int minRank = 0;
+    TaxID selctedTaxon = 0;
+    float selectedPercent = 0;
+
+    for (std::map<TaxID,unsigned int>::const_iterator it = ancTaxIdsCounts.begin(); it != ancTaxIdsCounts.end(); ++it) {
+        // the map is non-empty only if totalAssignedSeqs > 0, so this never divides by zero
+        const float currPercent = float(it->second) / totalAssignedSeqs;
+        if ((currPercent >= majorityCutoff) == false) {
+            continue;
+        }
+        // every key was resolved to a node while counting, so this lookup cannot return NULL
+        TaxonNode const * node = taxonomy->taxonNode(it->first, false);
+        const int currRankInd = NcbiTaxonomy::findRankIndex(node->rank);
+        if (currRankInd <= 0) {
+            continue;
+        }
+        if (hasSelection == false || currRankInd < minRank || (currRankInd == minRank && currPercent > selectedPercent)) {
+            hasSelection = true;
+            selctedTaxon = it->first;
+            minRank = currRankInd;
+            selectedPercent = currPercent;
+        }
+    }
+
+    return (selctedTaxon);
+}
+
+// aggregatetax: writes one taxonomy record per set of sequences. db1 is the taxonomy, db2 lists the
+// member sequence keys of each set, db3 holds the per-sequence taxonomy results, db4 is the output.
+// Record layout: taxId <tab> rank <tab> name [<tab> ranks joined by ';'] [<tab> lineage]
+int aggregatetax(int argc, const char **argv, const Command& command) {
+    Parameters& par = Parameters::getInstance();
+    par.parseParameters(argc, argv, command, true, 0, 0);
+
+    // open taxonomy - evolutionary relationships amongst taxa
+    NcbiTaxonomy * t = NcbiTaxonomy::openTaxonomy(par.db1);
+
+    // open mapping of set to sequence
+    DBReader<unsigned int> setToSeqReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+    setToSeqReader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
+
+    // open tax assignments per sequence
+    DBReader<unsigned int> taxSeqReader(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
+    taxSeqReader.open(DBReader<unsigned int>::NOSORT);
+
+    DBWriter writer(par.db4.c_str(), par.db4Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_TAXONOMICAL_RESULT);
+    writer.open();
+
+    std::vector<std::string> ranks = NcbiTaxonomy::parseRanks(par.lcaRanks);
+
+    // record for sets without a selected taxon: the optional columns are present but empty
+    std::string noTaxResult = "0\tno rank\tunclassified";
+    if (!ranks.empty()) {
+        noTaxResult += '\t';
+    }
+    if (par.showTaxLineage) {
+        noTaxResult += '\t';
+    }
+    noTaxResult += '\n';
+
+    // progress is reported once per set, so it is sized by the set database
+    Debug::Progress progress(setToSeqReader.getSize());
+
+    #pragma omp parallel
+    {
+        unsigned int thread_idx = 0;
+#ifdef OPENMP
+        thread_idx = (unsigned int) omp_get_thread_num();
+#endif
+        // per thread variables
+        const char *entry[2048];
+        std::vector<TaxID> setTaxa;
+        std::string setTaxStr;
+        setTaxStr.reserve(4096);
+
+        #pragma omp for schedule(dynamic, 10)
+        for (size_t i = 0; i < setToSeqReader.getSize(); ++i) {
+            progress.updateProgress();
+            unsigned int setKey = setToSeqReader.getDbKey(i);
+            char *results = setToSeqReader.getData(i, thread_idx);
+
+            // process a specific set: collect the taxon of every member sequence
+            setTaxa.clear();
+            while (*results != '\0') {
+                TaxID taxon = 0; // 0 = unassigned; selectTaxForSet skips it
+                if (Util::getWordsOfLine(results, entry, 255) > 0) {
+                    unsigned int seqKey = Util::fast_atoi<unsigned int>(entry[0]);
+                    // NOTE(review): assumes getDataByDBKey yields NULL for a key missing from db3 - confirm
+                    char *seqToTaxData = taxSeqReader.getDataByDBKey(seqKey, thread_idx);
+                    if (seqToTaxData != NULL && Util::getWordsOfLine(seqToTaxData, entry, 255) > 0) {
+                        taxon = Util::fast_atoi<int>(entry[0]);
+                    }
+                }
+                setTaxa.emplace_back(taxon);
+                results = Util::skipLine(results);
+            }
+
+            // aggregate
+            TaxID setSelectedTaxon = selectTaxForSet(setTaxa, t, par.majorityThr);
+            TaxonNode const * node = t->taxonNode(setSelectedTaxon, false);
+            if ((setSelectedTaxon == 0) || (node == NULL)) {
+                setTaxStr = noTaxResult;
+            } else {
+                setTaxStr = SSTR(node->taxId) + '\t' + node->rank + '\t' + node->name;
+                if (!ranks.empty()) {
+                    std::string lcaRanks = Util::implode(t->AtRanks(node, ranks), ';');
+                    setTaxStr += '\t' + lcaRanks;
+                }
+                if (par.showTaxLineage) {
+                    setTaxStr += '\t' + t->taxLineage(node);
+                }
+                setTaxStr += '\n';
+            }
+            writer.writeData(setTaxStr.c_str(), setTaxStr.size(), setKey, thread_idx);
+        }
+    }
+    Debug(Debug::INFO) << "\n";
+    writer.close();
+    taxSeqReader.close();
+    setToSeqReader.close();
+    delete t;
+    return EXIT_SUCCESS;
+}
diff --git a/src/taxonomy/createtaxdb.cpp b/src/taxonomy/createtaxdb.cpp
index e4e033e..1c0d440 100644
--- a/src/taxonomy/createtaxdb.cpp
+++ b/src/taxonomy/createtaxdb.cpp
@@ -25,14 +25,14 @@ int createtaxdb(int argc, const char **argv, const Command& command) {
     CommandCaller cmd;
 
     cmd.addVariable("TMP_PATH", tmp.c_str());
-    if(par.taxMappingFile.size() == 0){
+    if (par.taxMappingFile.empty()) {
         cmd.addVariable("DOWNLOAD_MAPPING", "1");
     }else{
         cmd.addVariable("DOWNLOAD_MAPPING", "0");
         cmd.addVariable("MAPPINGFILE", par.taxMappingFile.c_str());
 
     }
-    if(par.ncbiTaxDump.size() == 0){
+    if (par.ncbiTaxDump.empty()) {
         cmd.addVariable("DOWNLOAD_NCBITAXDUMP", "1");
     }else{
         cmd.addVariable("DOWNLOAD_NCBITAXDUMP", "0");
diff --git a/src/taxonomy/filtertaxdb.cpp b/src/taxonomy/filtertaxdb.cpp
index 8d4f0dc..913a86a 100644
--- a/src/taxonomy/filtertaxdb.cpp
+++ b/src/taxonomy/filtertaxdb.cpp
@@ -22,11 +22,8 @@ int filtertaxdb(int argc, const char **argv, const Command& command) {
     DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, reader.getDbtype());
     writer.open();
 
-    std::vector<std::string> ranks = Util::split(par.lcaRanks, ":");
-
     // a few NCBI taxa are blacklisted by default, they contain unclassified sequences (e.g. metagenomes) or other sequences (e.g. plasmids)
     // if we do not remove those, a lot of sequences would be classified as Root, even though they have a sensible LCA
-    TaxonomyExpression taxonomyExpression(par.taxonList);
 
     Debug::Progress progress(reader.getSize());
 
@@ -37,8 +34,8 @@ int filtertaxdb(int argc, const char **argv, const Command& command) {
 #ifdef OPENMP
         thread_idx = (unsigned int) omp_get_thread_num();
 #endif
-
         const char *entry[255];
+        TaxonomyExpression taxonomyExpression(par.taxonList, *t);
 
         #pragma omp for schedule(dynamic, 10)
         for (size_t i = 0; i < reader.getSize(); ++i) {
@@ -55,7 +52,6 @@ int filtertaxdb(int argc, const char **argv, const Command& command) {
             std::vector<int> taxa;
             while (*data != '\0') {
                 unsigned int taxon;
-                bool isAncestor;
                 const size_t columns = Util::getWordsOfLine(data, entry, 255);
                 if (columns == 0) {
                     Debug(Debug::WARNING) << "Empty entry: " << i << "!";
@@ -65,8 +61,7 @@ int filtertaxdb(int argc, const char **argv, const Command& command) {
                 taxon = Util::fast_atoi<unsigned int>(entry[0]);
                 writer.writeStart(thread_idx);
 
-                isAncestor = (taxonomyExpression.isAncestorOf(*t, taxon) != -1);
-                if (isAncestor) {
+                if (taxonomyExpression.isAncestor(taxon)) {
                     char * nextData = Util::skipLine(data);
                     size_t dataSize = nextData - data;
                     writer.writeAdd(data, dataSize, thread_idx);
diff --git a/src/taxonomy/filtertaxseqdb.cpp b/src/taxonomy/filtertaxseqdb.cpp
new file mode 100644
index 0000000..9a7fa68
--- /dev/null
+++ b/src/taxonomy/filtertaxseqdb.cpp
@@ -0,0 +1,112 @@
+#include "NcbiTaxonomy.h"
+#include "Parameters.h"
+#include "DBWriter.h"
+#include "FileUtil.h"
+#include "Debug.h"
+#include "Util.h"
+#include "TaxonomyExpression.h"
+#include <map>
+#include <algorithm>
+
+#ifdef OPENMP
+#include <omp.h>
+#endif
+
+static bool compareToFirstInt(const std::pair<unsigned int, unsigned int>& lhs, const std::pair<unsigned int, unsigned int>& rhs){
+    return (lhs.first <= rhs.first); // '<=' makes std::upper_bound stop at the first key >= the searched key; NOTE(review): not a strict weak ordering for std::stable_sort
+}
+
+// filtertaxseqdb: writes to db2 the entries of sequence database db1 whose taxon, looked up in
+// db1's "_mapping" file (0 if the key has no mapping), satisfies the expression in par.taxonList.
+int filtertaxseqdb(int argc, const char **argv, const Command& command) {
+    Parameters& par = Parameters::getInstance();
+    par.parseParameters(argc, argv, command, true, 0, 0);
+
+    // open mapping (dbKey to taxid)
+    std::vector<std::pair<unsigned int, unsigned int>> mapping;
+    if (FileUtil::fileExists(std::string(par.db1 + "_mapping").c_str()) == false) {
+        Debug(Debug::ERROR) << par.db1 + "_mapping" << " does not exist. Please create the taxonomy mapping!\n";
+        EXIT(EXIT_FAILURE);
+    }
+    bool isSorted = Util::readMapping(par.db1 + "_mapping", mapping);
+    if (isSorted == false) {
+        std::stable_sort(mapping.begin(), mapping.end(), compareToFirstInt);
+    }
+
+    // open taxonomy - evolutionary relationships amongst taxa
+    NcbiTaxonomy * t = NcbiTaxonomy::openTaxonomy(par.db1);
+
+    DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
+    reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
+    const bool isCompressed = reader.isCompressed();
+
+    DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), par.threads, 0, Parameters::DBTYPE_OMIT_FILE);
+    writer.open();
+
+    Debug::Progress progress(reader.getSize());
+
+    Debug(Debug::INFO) << "Filtering sequences by taxonomy\n";
+    #pragma omp parallel
+    {
+        unsigned int thread_idx = 0;
+#ifdef OPENMP
+        thread_idx = (unsigned int) omp_get_thread_num();
+#endif
+        // the expression keeps per-evaluation state, so each thread builds its own
+        TaxonomyExpression taxonomyExpression(par.taxonList, *t);
+        #pragma omp for schedule(dynamic, 10)
+        for (size_t i = 0; i < reader.getSize(); ++i) {
+            progress.updateProgress();
+
+            unsigned int key = reader.getDbKey(i);
+            size_t offset = reader.getOffset(i);
+            size_t length = reader.getEntryLen(i);
+            unsigned int taxon = 0;
+
+            // match dbKey to its taxon based on mapping
+            std::pair<unsigned int, unsigned int> val;
+            val.first = key;
+            std::vector<std::pair<unsigned int, unsigned int>>::iterator mappingIt;
+            mappingIt = std::upper_bound(mapping.begin(), mapping.end(), val, compareToFirstInt);
+            if (mappingIt == mapping.end() || mappingIt->first != val.first) {
+                taxon = 0;
+            } else {
+                taxon = mappingIt->second;
+            }
+
+            // the entry is retained if its taxon satisfies the taxonomy expression
+            if (taxonomyExpression.isAncestor(taxon)) {
+                if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) {
+                    // soft mode: the index entry reuses the reader's offset and length
+                    writer.writeIndexEntry(key, offset, length, thread_idx);
+                } else {
+                    char* data = reader.getDataUncompressed(i);
+                    size_t entryLength = std::max(length, static_cast<size_t>(1)) - 1;
+
+                    if (isCompressed) {
+                        // copy also the null byte since it contains the information if compressed or not
+                        entryLength = *(reinterpret_cast<unsigned int *>(data)) + sizeof(unsigned int) + 1;
+                        writer.writeData(data, entryLength, key, thread_idx, false, false);
+                    } else {
+                        writer.writeData(data, entryLength, key, thread_idx, true, false);
+                    }
+                    writer.writeIndexEntry(key, writer.getStart(thread_idx), length, thread_idx);
+                }
+            }
+        }
+    }
+    Debug(Debug::INFO) << "\n";
+
+    writer.close(true);
+    if (par.subDbMode == Parameters::SUBDB_MODE_SOFT) {
+        DBReader<unsigned int>::softlinkDb(par.db1, par.db2, DBFiles::SEQUENCE_NO_DATA_INDEX);
+    } else {
+        DBWriter::writeDbtypeFile(par.db2.c_str(), reader.getDbtype(), isCompressed);
+        DBReader<unsigned int>::softlinkDb(par.db1, par.db2, DBFiles::SEQUENCE_ANCILLARY);
+    }
+
+    reader.close();
+    delete t;
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/taxonomy/lca.cpp b/src/taxonomy/lca.cpp
index ff02835..38f2d76 100644
--- a/src/taxonomy/lca.cpp
+++ b/src/taxonomy/lca.cpp
@@ -35,7 +35,7 @@ int lca(int argc, const char **argv, const Command& command) {
     DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_TAXONOMICAL_RESULT);
     writer.open();
 
-    std::vector<std::string> ranks = Util::split(par.lcaRanks, ";");
+    std::vector<std::string> ranks = NcbiTaxonomy::parseRanks(par.lcaRanks);
 
     // a few NCBI taxa are blacklisted by default, they contain unclassified sequences (e.g. metagenomes) or other sequences (e.g. plasmids)
     // if we do not remove those, a lot of sequences would be classified as Root, even though they have a sensible LCA
@@ -49,11 +49,21 @@ int lca(int argc, const char **argv, const Command& command) {
     size_t taxonNotFound = 0;
     size_t found = 0;
 
+    // will be used when no hits
+    std::string noTaxResult = "0\tno rank\tunclassified";
+    if (!ranks.empty()) {
+        noTaxResult += '\t';
+    }
+    if (par.showTaxLineage) {
+        noTaxResult += '\t';
+    }
+    noTaxResult += '\n';
+
+
     Debug(Debug::INFO) << "Computing LCA\n";
     #pragma omp parallel
     {
         const char *entry[255];
-        char buffer[1024];
         std::string resultData;
         resultData.reserve(4096);
         unsigned int thread_idx = 0;
@@ -110,20 +120,17 @@ int lca(int argc, const char **argv, const Command& command) {
                 data = Util::skipLine(data);
             }
 
-            if(length == 1){
-                snprintf(buffer, 1024, "0\tno rank\tunclassified\n");
-                writer.writeData(buffer, strlen(buffer), key, thread_idx);
+            if (length == 1) {
+                writer.writeData(noTaxResult.c_str(), noTaxResult.size(), key, thread_idx);
                 continue;
             }
 
             TaxonNode const * node = t->LCA(taxa);
             if (node == NULL) {
-                snprintf(buffer, 1024, "0\tno rank\tunclassified\n");
-                writer.writeData(buffer, strlen(buffer), key, thread_idx);
+                writer.writeData(noTaxResult.c_str(), noTaxResult.size(), key, thread_idx);
                 continue;
             }
 
-
             resultData = SSTR(node->taxId) + '\t' + node->rank + '\t' + node->name;
             if (!ranks.empty()) {
                 std::string lcaRanks = Util::implode(t->AtRanks(node, ranks), ';');
@@ -138,7 +145,7 @@ int lca(int argc, const char **argv, const Command& command) {
         }
     };
     Debug(Debug::INFO) << "\n";
-    Debug(Debug::INFO) << "Taxonomy for " << taxonNotFound << " entries not found out of " << taxonNotFound+found << "\n";
+    Debug(Debug::INFO) << "Taxonomy for " << taxonNotFound << " out of " << taxonNotFound+found << " entries not found\n";
     writer.close();
     reader.close();
     delete t;
diff --git a/src/taxonomy/taxonomyreport.cpp b/src/taxonomy/taxonomyreport.cpp
index 3c1cc8b..a781811 100644
--- a/src/taxonomy/taxonomyreport.cpp
+++ b/src/taxonomy/taxonomyreport.cpp
@@ -188,7 +188,8 @@ int taxonomyreport(int argc, const char **argv, const Command& command) {
     };
     Debug(Debug::INFO) << "\n";
     Debug(Debug::INFO) << "Found " << taxCounts.size() << " different taxa for " << reader.getSize() << " different reads.\n";
-    Debug(Debug::INFO) << taxCounts.at(0) << " reads are unclassified.\n";
+    unsigned int unknownCnt = (taxCounts.find(0) != taxCounts.end()) ? taxCounts.at(0) : 0;
+    Debug(Debug::INFO) << unknownCnt << " reads are unclassified.\n";
 
     std::unordered_map<TaxID, TaxonCounts> cladeCounts = taxDB->getCladeCounts(taxCounts);
     if (par.reportMode == 0) {
diff --git a/src/test/TestAlignment.cpp b/src/test/TestAlignment.cpp
index 977b57b..78ebefb 100644
--- a/src/test/TestAlignment.cpp
+++ b/src/test/TestAlignment.cpp
@@ -33,8 +33,8 @@ int main (int, const char**) {
 
     SubstitutionMatrix subMat("blosum62.out", 2.0, -0.0f);
     std::cout << "Subustitution matrix:\n";
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
 //    for(int i = 0; i < 255; i++){
 //        std::cout << i << "\t" << MathUtil::convertCharToFloat(i) << std::endl;
 //    }
@@ -53,8 +53,8 @@ int main (int, const char**) {
 //			          "GWLKTHVSDAVAVQSRIIYGGSVTGGNCKELASQHDVDGFLVGGASLKPEFVDIINAKH";
 //    std::string tim1 = "LAEVGDARSLLEDDLVDLPDARFFKAMGREFVKLMLQGEASEAIKAPRAAAAVLPKQYTRDEDGDGVNLVLLVERVLEVPDECRLYIIGVAARVAGATVVYATGSRKKDAALPIANDETHLTAVLAKGESLPPPPENPMSADRVRWEHIQRIYEMCDRNVSETARRLNMHRRTLQRILAKRSPR";
 //    std::string tim2 = "EMDLAFVELGADRSLLLVDDDEPFLKRLAKAMEKRGFVLETAQSVAEGKAIAQARPPAYAVVDLRLEDGNGLDVVEVLRERRPDCRIVVLTGYGAIATAVAAVKIGATDYLSKPADANEVTHALLAKGESLPPPPENPMSADRVRWEHIQRIYEMCDRNVSETARRLNMHRRTLQRILAKRSPR";
-    std::string tim1 = "'GLTVDCVVFGLDEQIDLKVLLIQRQIPPFQHQWALPGGFVQMDESLEDAARRELREETGVQGIFLEQLYTFGDLGRDPRDRIISVAYYALINLIEYPLQASTDAEDAAWYSIENLPSLAFDHAQILKQAI";
-    std::string tim2 = "'GLTADVVILYNGGIVLIKRKHEPFKDHYALPGGFVEYGETVEEAALREAKEETGLDVRLIRLVGVYSDPNRDPRGHTVTTAFLAIGTGKLKAGDDAEEVHVVPVEEALKLPLAFDHAKILRDAL";
+    std::string tim1 = "'AAAGGTGACCGGGCACGGTGGCCCATGCCTATAATCCCAGCACTTTGGGAGGCCCAGGCAGGTGGATCACTTGAGGTCAGGAGTTCGAGACCAGCCTGGC";
+    std::string tim2 = "'GATTGAAAAACTCCCAGGCTGGACACGGTGGCCCATGCCTGTAATCCCAGCACTCTGGGAGGCTGAGGTGGGCTGATCCCTTGAGGTCAGGAGTTCGAGACCATCCTGGAAAATGTGGCA";
 
     std::cout << "Sequence (id 0):\n";
     //const char* sequence = read_seq;
@@ -80,14 +80,14 @@ int main (int, const char**) {
         sum += subMat.subMatrix[i][i];
     }
     std::cout << "Test: " << sum/ subMat.alphabetSize << std::endl;
-    aligner.ssw_init(s, tinySubMat, &subMat, subMat.alphabetSize, 2);
+    aligner.ssw_init(s, tinySubMat, &subMat, 2);
     int32_t maskLen = s->L / 2;
     int gap_open = 11;
     int gap_extend = 1;
     float seqId = 1.0;
     int aaIds = 0;
     EvalueComputation evalueComputation(100000, &subMat, gap_open, gap_extend);
-    s_align alignment = aligner.ssw_align(dbSeq->int_sequence, dbSeq->L, gap_open, gap_extend, 2, 10000, &evalueComputation, 0, 0.0, maskLen);
+    s_align alignment = aligner.ssw_align(dbSeq->numSequence, dbSeq->L, gap_open, gap_extend, 2, 10000, &evalueComputation, 0, 0.0, maskLen);
     if(alignment.cigar){
         std::cout << "Cigar" << std::endl;
 
@@ -97,27 +97,27 @@ int main (int, const char**) {
             uint32_t length = SmithWaterman::cigar_int_to_len(alignment.cigar[c]);
             for (uint32_t i = 0; i < length; ++i){
                 if (letter == 'M') {
-                    fprintf(stdout,"%c",subMat.int2aa[s->int_sequence[queryPos]]);
+                    fprintf(stdout,"%c",subMat.num2aa[s->numSequence[queryPos]]);
 
-                    if (dbSeq->int_sequence[targetPos] == s->int_sequence[queryPos]){
+                    if (dbSeq->numSequence[targetPos] == s->numSequence[queryPos]){
                         fprintf(stdout, "|");
                         aaIds++;
                     }
                     else fprintf(stdout, "*");
-                    fprintf(stdout,"%c",subMat.int2aa[dbSeq->int_sequence[targetPos]]);
+                    fprintf(stdout,"%c",subMat.num2aa[dbSeq->numSequence[targetPos]]);
 
                     ++queryPos;
                     ++targetPos;
                 } else {
                     if (letter == 'I'){
-                        fprintf(stdout,"%c",subMat.int2aa[s->int_sequence[queryPos]]);
+                        fprintf(stdout,"%c",subMat.num2aa[s->numSequence[queryPos]]);
                         fprintf(stdout, " ");
                         fprintf(stdout, "|");
                         ++queryPos;
                     }else{
                         fprintf(stdout, "|");
                         fprintf(stdout, " ");
-                        fprintf(stdout,"%c",subMat.int2aa[dbSeq->int_sequence[targetPos]]);
+                        fprintf(stdout,"%c",subMat.num2aa[dbSeq->numSequence[targetPos]]);
                         ++targetPos;
                     };
                 }
diff --git a/src/test/TestAlignmentPerformance.cpp b/src/test/TestAlignmentPerformance.cpp
index 1aaebc9..6d78a8c 100644
--- a/src/test/TestAlignmentPerformance.cpp
+++ b/src/test/TestAlignmentPerformance.cpp
@@ -57,7 +57,7 @@ int main (int, const char**) {
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, 0);
     std::cout << "Subustitution matrix:\n";
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
     std::cout << "\n";
 
     std::cout << "subMatrix:\n";
@@ -86,13 +86,13 @@ int main (int, const char**) {
     std::vector<std::string> sequences = readData("/Users/mad/Documents/databases/rfam/Rfam.fasta");
     for(size_t seq_i = 0; seq_i < sequences.size(); seq_i++){
         query->mapSequence(1,1,sequences[seq_i].c_str(), sequences[seq_i].size());
-        aligner.ssw_init(query, tinySubMat, &subMat, subMat.alphabetSize, 2);
+        aligner.ssw_init(query, tinySubMat, &subMat, 2);
 
         for(size_t seq_j = 0; seq_j < sequences.size(); seq_j++) {
             dbSeq->mapSequence(2, 2, sequences[seq_j].c_str(),  sequences[seq_j].size());
             int32_t maskLen = query->L / 2;
             EvalueComputation evalueComputation(100000, &subMat, gap_open, gap_extend);
-            s_align alignment = aligner.ssw_align(dbSeq->int_sequence, dbSeq->L, gap_open, gap_extend, 0, 10000, &evalueComputation, 0, 0.0, maskLen);
+            s_align alignment = aligner.ssw_align(dbSeq->numSequence, dbSeq->L, gap_open, gap_extend, 0, 10000, &evalueComputation, 0, 0.0, maskLen);
             if(mode == 0 ){
                 cells += query->L * dbSeq->L;
                 std::cout << alignment.qEndPos1 << " " << alignment.dbEndPos1 << "\n";
diff --git a/src/test/TestAlignmentTraceback.cpp b/src/test/TestAlignmentTraceback.cpp
index e30fa34..6623028 100644
--- a/src/test/TestAlignmentTraceback.cpp
+++ b/src/test/TestAlignmentTraceback.cpp
@@ -29,7 +29,7 @@ scores workspace[10000*2 + 2];
 unsigned char bt[10000*10000];
 
 void sw(
-        const int *db_sequence, const int *query_sequence,
+        const unsigned char *db_sequence, const unsigned char *query_sequence,
         const short ** profile_word,
         int32_t query_start, int32_t query_end,
         int32_t target_start, int32_t target_end,
@@ -97,13 +97,13 @@ void sw(
         switch (state) {
             case M: // current state is MM, previous state is bMM[i][j]
                 matched_cols++;
-                fprintf(stdout,"%c", subMat.int2aa[db_sequence[target_start+i]]);
+                fprintf(stdout,"%c", subMat.num2aa[db_sequence[target_start+i]]);
                 if(query_sequence[query_start + j] == db_sequence[target_start + i]){
                     fprintf(stdout, "|");
                 }else{
                     fprintf(stdout, "*");
                 }
-                fprintf(stdout,"%c", subMat.int2aa[query_sequence[query_start+j]]);
+                fprintf(stdout,"%c", subMat.num2aa[query_sequence[query_start+j]]);
                 i--; j--;
                 state = (i < 0 || j < 0) ? -1 : get_val(bt, i, j);
                 break;
@@ -140,7 +140,7 @@ int main (int, const char**) {
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
     std::cout << "Subustitution matrix:\n";
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
 
 
     //   BaseMatrix::print(subMat.subMatrix, subMat.alphabetSize);
@@ -182,11 +182,11 @@ int main (int, const char**) {
     for(int i = 0; i< subMat.alphabetSize; i++) {
         profile[i] = &profile_data[i*s->L];
         for (int j = 0; j < s->L; j++) {
-            profile[i][j] = tinySubMat[i*subMat.alphabetSize + s->int_sequence[j]];
+            profile[i][j] = tinySubMat[i*subMat.alphabetSize + s->numSequence[j]];
         }
     }
 
-    sw(dbSeq->int_sequence, s->int_sequence, (const short ** ) profile, 92, 157, 80, 146, 11, 1, subMat);
+    sw(dbSeq->numSequence, s->numSequence, (const short ** ) profile, 92, 157, 80, 146, 11, 1, subMat);
     // calcuate stop score
 
     delete [] tinySubMat;
diff --git a/src/test/TestBestAlphabet.cpp b/src/test/TestBestAlphabet.cpp
index 730c36c..fb432c5 100644
--- a/src/test/TestBestAlphabet.cpp
+++ b/src/test/TestBestAlphabet.cpp
@@ -66,7 +66,7 @@ int main (int, const char**) {
     for(size_t a = 0; a < numAAs; a++)
         seqid += subMat.probMatrix[a][a];
     printf("Substitution matrix with average sequence identity %4.1f%%:\n",100*seqid);
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
 
 
     //////////////////////////////////////////////////////////////////////////////////////////////
@@ -90,7 +90,7 @@ int main (int, const char**) {
         unsigned int dbKey = seqDb.getDbKey(id);
         rseqKmer.mapSequence(id, dbKey, seqData, seqDb.getSeqLen(id));
         while (rseqKmer.hasNextKmer() && sumKmerCnts < 20000*numKmers) {
-            const int* kmer = rseqKmer.nextKmer();
+            const unsigned char* kmer = rseqKmer.nextKmer();
 
             // Ignore k-mers containing an X (which is encoded by numAAs)
             unsigned pos = 0;
diff --git a/src/test/TestCompositionBias.cpp b/src/test/TestCompositionBias.cpp
index 0675310..b28a0a0 100644
--- a/src/test/TestCompositionBias.cpp
+++ b/src/test/TestCompositionBias.cpp
@@ -20,9 +20,9 @@ void calcLocalAaBiasCorrection(Sequence* seq, SubstitutionMatrix * m){
         const int _2d = maxPos - minPos;
         // negative score for the amino acids in the neighborhood of i
         int sumSubScores = 0;
-        short * subMat = m->subMatrix[seq->int_sequence[i]];
+        short * subMat = m->subMatrix[seq->numSequence[i]];
         for (int j = minPos; j < maxPos; j++){
-            sumSubScores += (j != i) ? subMat[seq->int_sequence[j]] : 0;
+            sumSubScores += (j != i) ? subMat[seq->numSequence[j]] : 0;
         }
         float deltaS_i = (float) sumSubScores;
         deltaS_i /= -1.0 * _2d;
@@ -53,7 +53,7 @@ int main (int, const char**) {
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 8.0, 0);
     std::cout << "Substitution matrix:";
-    SubstitutionMatrix::print(subMat.subMatrix, subMat.int2aa, subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix, subMat.num2aa, subMat.alphabetSize);
 
     const char *ref = "MDDVKIERLKRLNEDVLEDLIEVYMRGYEGLEEYGGEGRDYARDYIKWCWKKAPDGFFVAKVGDRIVGFIVCDRDWYSRYEGKIVGAIHEFVVDKGWQGKGIGKKLLTKCLEFLGKYNDTIELWVGEKNFGAMRLYEKFGFKKVGKSGIWIRMVRRQLS";
     Sequence refSeq(10000, 0, &subMat, kmer_size, false, true);
diff --git a/src/test/TestDiagonalScoring.cpp b/src/test/TestDiagonalScoring.cpp
index ecd4501..68b054b 100644
--- a/src/test/TestDiagonalScoring.cpp
+++ b/src/test/TestDiagonalScoring.cpp
@@ -19,7 +19,7 @@ int main (int, const char**) {
     size_t kmer_size = 6;
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 8.0, -0.2);
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
 
     std::string S1 = "PQITLWQG";
     const char* S1char = S1.c_str();
@@ -93,7 +93,7 @@ int main (int, const char**) {
     CounterResult hits[32];
     UngappedAlignment matcher(10000, &subMat, &lookup);
 
-    SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s5.int_sequence, s5.L, compositionBias);
+    SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s5.numSequence, s5.L, compositionBias);
     memset(compositionBias, 0.0, sizeof(float)*s5.L);
 //    std::cout << compositionBias[74] << std::endl;
 //    std::cout << compositionBias[79] << std::endl;
@@ -115,80 +115,80 @@ int main (int, const char**) {
 
 
 
-    SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.int_sequence, s1.L, compositionBias);
+    SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias);
 
     hits[0].id = s1.getId();
     hits[0].diagonal = 0;
     matcher.processQuery(&s1, compositionBias, hits, 1);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
     for(int i = 0; i < 16; i++){
         hits[i].id = s1.getId();
         hits[i].diagonal = 0;
     }
     matcher.processQuery(&s1, compositionBias, hits, 16);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
 
     hits[0].id = s1.getId();
     hits[0].diagonal = 9;
     matcher.processQuery(&s2, compositionBias, hits, 1);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
     for(int i = 0; i < 16; i++){
         hits[i].id = s1.getId();
         hits[i].diagonal = 9;
     }
     matcher.processQuery(&s2, compositionBias, hits, 16);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
     for(int i = 0; i < 16; i++){
         hits[i].id = s2.getId();
         hits[i].diagonal = -9;
     }
     matcher.processQuery(&s1, compositionBias, hits, 16);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
     matcher.processQuery(&s1, compositionBias, hits, 1);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
     for(int i = 0; i < 16; i++){
         hits[i].id = s2.getId();
         hits[i].diagonal = -9;
     }
     matcher.processQuery(&s3, compositionBias, hits, 16);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
     matcher.processQuery(&s3, compositionBias, hits, 1);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
 
     hits[0].id = s4.getId();
     hits[0].diagonal = -256;
     matcher.processQuery(&s1, compositionBias, hits, 1);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
 
     hits[0].id = s1.getId();
     hits[0].diagonal = 256;
     matcher.processQuery(&s4,compositionBias, hits, 1);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
     hits[0].id = s7.getId();
     hits[0].diagonal = -512;
     matcher.processQuery(&s1,compositionBias, hits, 16);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
     hits[0].id = s1.getId();
     hits[0].diagonal = 512;
     matcher.processQuery(&s7,compositionBias, hits, 16);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
 
     hits[0].id = s7.getId();
     hits[0].diagonal = 0;
     matcher.processQuery(&s7, compositionBias, hits, 16);
-    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
+    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count <<  std::endl;
 
     delete [] compositionBias;
 }
diff --git a/src/test/TestDiagonalScoringPerformance.cpp b/src/test/TestDiagonalScoringPerformance.cpp
index 8671a2e..4dbfaab 100644
--- a/src/test/TestDiagonalScoringPerformance.cpp
+++ b/src/test/TestDiagonalScoringPerformance.cpp
@@ -25,7 +25,7 @@ int main (int, const char**) {
     size_t kmer_size = 6;
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 8.0, 0.0);
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
 
     std::string S1 = "AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL";
     const char* S1char = S1.c_str();
@@ -88,7 +88,7 @@ int main (int, const char**) {
     }
 
     float * compositionBias = new float[s1.L];
-    SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.int_sequence, s1.L, compositionBias);
+    SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias);
 
 
 
@@ -115,7 +115,7 @@ int main (int, const char**) {
         //   std::reverse(hits, hits+1000);
         matcher.processQuery(&s1, compositionBias, hits, 16000);
     }
-//    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.int_sequence, s1.int_sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].diagonalScore <<  std::endl;
+//    std::cout << ExtendedSubstitutionMatrix::calcScore(s1.sequence, s1.sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].diagonalScore <<  std::endl;
 //    std::cout << (int)hits[0].diagonalScore <<  std::endl;
     for(int i = 0; i < 1000; i++){
         std::cout << hits[i].id << "\t" << (int) hits[i].diagonal  << "\t" << (int)hits[i].count <<  std::endl;
diff --git a/src/test/TestExtendedSubstitutionMatrix.cpp b/src/test/TestExtendedSubstitutionMatrix.cpp
index 6ca6147..ed06c8a 100644
--- a/src/test/TestExtendedSubstitutionMatrix.cpp
+++ b/src/test/TestExtendedSubstitutionMatrix.cpp
@@ -24,7 +24,7 @@ int main (int argc, const char * argv[])
     SubstitutionMatrix subMat("/Users/aluucard/Documents/workspace/kClust2/data/blosum30.out",8.0);
     
     for(int i = 0; i<subMat.alphabetSize;i++)
-        printf("%c\t",subMat.int2aa[i]);
+        printf("%c\t",subMat.num2aa[i]);
     printf("\n");
 //    ReducedMatrix redMat(subMat.probMatrix, subMat.alphabetSize-2);
     
@@ -39,21 +39,21 @@ int main (int argc, const char * argv[])
     char* sequence = "AAMICPAEAGRPSLADS";
     std::cout << sequence << "\n\n";
     
-    Sequence* s = new Sequence (10000, subMat.aa2int, subMat.int2aa, 0, kmer_size, false);
+    Sequence* s = new Sequence (10000, subMat.aa2num, subMat.num2aa, 0, kmer_size, false);
     s->mapSequence(0,"LALA",sequence);
     
     printf("Normal : ");
     for(int i = 0; i<subMat.alphabetSize;i++)
-        printf("%c\t",subMat.int2aa[i]);
+        printf("%c\t",subMat.num2aa[i]);
     printf("\nReduced: ");
     for(int i = 0; i<subMat.alphabetSize;i++)
-        printf("%c\t",subMat.int2aa[i]);
+        printf("%c\t",subMat.num2aa[i]);
     printf("\nNormal : ");
     for(int i = 65; i<'Z';i++)
-        printf("%d\t",subMat.aa2int[i]); 
+        printf("%d\t",subMat.aa2num[i]);
     printf("\nReduced: ");
     for(int i = 65; i<'Z';i++)
-        printf("%d\t",subMat.aa2int[i]); 
+        printf("%d\t",subMat.aa2num[i]);
     
     std::cout << "\nInt reduced sequence:\n";
     for (int i = 0; i < s->L; i++)
diff --git a/src/test/TestIndexTable.cpp b/src/test/TestIndexTable.cpp
index 8671484..3da5307 100644
--- a/src/test/TestIndexTable.cpp
+++ b/src/test/TestIndexTable.cpp
@@ -27,7 +27,7 @@ int main (int, const char**) {
     Sequence *s = new Sequence(32000, Parameters::DBTYPE_AMINO_ACIDS, &subMat, 6, true, false);
     IndexTable t(subMat.alphabetSize, 6, false);
     IndexBuilder::fillDatabase(&t, NULL, NULL, subMat, s, &dbr, 0, dbr.getSize(), 0, 1, 1);
-    t.printStatistics(subMat.int2aa);
+    t.printStatistics(subMat.num2aa);
 
     delete s;
     dbr.close();
diff --git a/src/test/TestKmerGenerator.cpp b/src/test/TestKmerGenerator.cpp
index dfe6d22..55947d6 100644
--- a/src/test/TestKmerGenerator.cpp
+++ b/src/test/TestKmerGenerator.cpp
@@ -48,12 +48,12 @@ int main (int, const char**) {
     size_t * testKmer = new size_t[kmer_size];
     int i = 0; 
     while(s->hasNextKmer()){
-        const int * curr_pos = s->nextKmer();
+        const unsigned char * curr_pos = s->nextKmer();
         printf("Pos1: %d\n", i++);
 
         unsigned int idx_val=idx.int2index(curr_pos);
         std::cout << "Index:    " <<idx_val << "  ";
-        idx.printKmer(idx_val, kmer_size, subMat.int2aa);
+        idx.printKmer(idx_val, kmer_size, subMat.num2aa);
         std::cout << std::endl;
 //        std::cout << "MaxScore: " << extMattwo.scoreMatrix[idx_val]->back().first<< "\n";
         std::pair<size_t *, size_t > kmer_list= kmerGen.generateKmerList(curr_pos);
@@ -71,7 +71,7 @@ int main (int, const char**) {
                 std::cout << testKmer[i] << " ";
             std::cout << "\t";
             for (size_t i = 0; i < kmer_size; i++)
-                std::cout << subMat.int2aa[testKmer[i]];
+                std::cout << subMat.num2aa[testKmer[i]];
             std::cout << "\n";
         }
     }
diff --git a/src/test/TestKmerGeneratorPerf.cpp b/src/test/TestKmerGeneratorPerf.cpp
index cb50e0e..c6b174d 100644
--- a/src/test/TestKmerGeneratorPerf.cpp
+++ b/src/test/TestKmerGeneratorPerf.cpp
@@ -39,15 +39,15 @@ int main (int argc, const char * argv[])
     char* sequence = (char *) argv[1];
     std::cout << sequence << "\n\n";
  
-    Sequence* s = new Sequence (10000, subMat.aa2int, subMat.int2aa,0);
+    Sequence* s = new Sequence (10000, subMat.aa2num, subMat.num2aa,0);
     s->mapSequence(0,"lala",sequence);
     
     printf("Normal alphabet : ");
     for(int i = 0; i<subMat.alphabetSize;i++)
-        printf("%c\t",subMat.int2aa[i]);
+        printf("%c\t",subMat.num2aa[i]);
     printf("\nNormal int code: ");
     for(int i = 'A'; i<'Z';i++)
-        printf("%d\t",subMat.aa2int[i]); 
+        printf("%d\t",subMat.aa2num[i]);
     
     
     std::cout << "\nInt reduced sequence:\n";
diff --git a/src/test/TestKmerGeneratorProfile.cpp b/src/test/TestKmerGeneratorProfile.cpp
index e03923d..0723331 100644
--- a/src/test/TestKmerGeneratorProfile.cpp
+++ b/src/test/TestKmerGeneratorProfile.cpp
@@ -54,7 +54,7 @@ int main (int argc, const char * argv[])
     
     const char* sequence = (const char *) string;
 
-    Sequence* s = new Sequence (10000, subMat.aa2int, subMat.int2aa, Sequence::HMM_PROFILE, &subMat);
+    Sequence* s = new Sequence (10000, subMat.aa2num, subMat.num2aa, Sequence::HMM_PROFILE, &subMat);
     s->mapSequence(0,"lala",sequence);
 
     KmerGenerator kmerGen(kmer_size,subMat.alphabetSize,90);
@@ -83,7 +83,7 @@ int main (int argc, const char * argv[])
                 std::cout << testKmer[i] << " ";
             std::cout << "\t";
             for (size_t i = 0; i < kmer_size; i++)
-                std::cout << subMat.int2aa[testKmer[i]];
+                std::cout << subMat.num2aa[testKmer[i]];
             std::cout << "\n";
         }
     }
diff --git a/src/test/TestKmerNucl.cpp b/src/test/TestKmerNucl.cpp
index efbd4d4..870e7dd 100644
--- a/src/test/TestKmerNucl.cpp
+++ b/src/test/TestKmerNucl.cpp
@@ -33,12 +33,12 @@ int main (int, const char**) {
 
     size_t i = 0;
     while (s->hasNextKmer()) {
-        const int* curr_pos = s->nextKmer();
+        const unsigned char* curr_pos = s->nextKmer();
         printf("Pos1: %zu\n", i++);
 
         size_t idx_val = idx.int2index(curr_pos);
         std::cout << "Index: " << idx_val << " ";
-        idx.printKmer(idx_val, kmer_size, subMat.int2aa);
+        idx.printKmer(idx_val, kmer_size, subMat.num2aa);
         std::cout << std::endl;
 
         std::string kmerStr1;
@@ -46,7 +46,7 @@ int main (int, const char**) {
         for(size_t kmerPos = 0; kmerPos < kmer_size; kmerPos++){
             kmerIdx = kmerIdx << 2;
             kmerIdx = kmerIdx | curr_pos[kmerPos];
-            kmerStr1.append(1, subMat.int2aa[curr_pos[kmerPos]]);
+            kmerStr1.append(1, subMat.num2aa[curr_pos[kmerPos]]);
         }
 
         std::string revStr1;
diff --git a/src/test/TestKmerScore.cpp b/src/test/TestKmerScore.cpp
index 1a3c3d6..bd945ea 100644
--- a/src/test/TestKmerScore.cpp
+++ b/src/test/TestKmerScore.cpp
@@ -15,7 +15,7 @@ int main (int, const char**) {
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 8.0, 0);
     std::cout << "Subustitution matrix:\n";
-    SubstitutionMatrix::print(subMat.subMatrix, subMat.int2aa, subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix, subMat.num2aa, subMat.alphabetSize);
 
     const char *ref = "GKILII";
     Sequence refSeq(1000,  0, &subMat, kmer_size, false, true);
@@ -28,7 +28,7 @@ int main (int, const char**) {
 
     short score = 0;
         for(size_t i = 0; i < kmer_size; i++){
-            score += subMat.subMatrix[refSeq.int_sequence[i]][similarSeq.int_sequence[i]];
+            score += subMat.subMatrix[refSeq.numSequence[i]][similarSeq.numSequence[i]];
         }
     std::cout << score << std::endl;
 
diff --git a/src/test/TestKsw2.cpp b/src/test/TestKsw2.cpp
index 551365c..a7b1e5e 100644
--- a/src/test/TestKsw2.cpp
+++ b/src/test/TestKsw2.cpp
@@ -332,7 +332,7 @@ int main (int, const char**) {
 //    short diagonal = 15-14;
 
     NucleotideMatrix subMat("blosum62.out", 2.0, -0.0f);
-    BandedNucleotideAligner aligner((BaseMatrix*)&subMat, 10000,  5, 1);
+    BandedNucleotideAligner aligner((BaseMatrix*)&subMat, 10000,  5, 1, 40);
     EvalueComputation evalueComputation(100000, &subMat, 7, 1);
     
     
@@ -360,27 +360,27 @@ int main (int, const char**) {
 //                uint32_t length = SmithWaterman::cigar_int_to_len(alignment.cigar[c]);
 //                for (uint32_t i = 0; i < length; ++i){
 //                    if (letter == 'M') {
-//                        queryAln.push_back(subMat.int2aa[queryObj->int_sequence[queryPos]]);
-//                        if (targetObj->int_sequence[targetPos] == queryObj->int_sequence[queryPos]){
+//                        queryAln.push_back(subMat.num2aa[queryObj->sequence[queryPos]]);
+//                        if (targetObj->sequence[targetPos] == queryObj->sequence[queryPos]){
 //                            middleAln.push_back('|');
 //                            aaIds++;
 //                        }
 //                        else {
 //                            middleAln.push_back('*');
 //                        }
-//                        targetAln.push_back(subMat.int2aa[targetObj->int_sequence[targetPos]]);
+//                        targetAln.push_back(subMat.num2aa[targetObj->sequence[targetPos]]);
 //                        ++queryPos;
 //                        ++targetPos;
 //                    } else {
 //                        if (letter == 'I'){
-//                            queryAln.push_back(subMat.int2aa[queryObj->int_sequence[queryPos]]);
+//                            queryAln.push_back(subMat.num2aa[queryObj->sequence[queryPos]]);
 //                            middleAln.push_back(' ');
 //                            targetAln.push_back('-');
 //                            ++queryPos;
 //                        }else{
 //                            queryAln.push_back('-');
 //                            middleAln.push_back(' ');
-//                            targetAln.push_back(subMat.int2aa[targetObj->int_sequence[targetPos]]);
+//                            targetAln.push_back(subMat.num2aa[targetObj->sequence[targetPos]]);
 //                            ++targetPos;
 //                        };
 //                    }
diff --git a/src/test/TestMultipleAlignment.cpp b/src/test/TestMultipleAlignment.cpp
index ab63ec0..456e07b 100644
--- a/src/test/TestMultipleAlignment.cpp
+++ b/src/test/TestMultipleAlignment.cpp
@@ -22,7 +22,7 @@ int main (int, const char**) {
 
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, 0);
     std::cout << "Subustitution matrix:\n";
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
     //   BaseMatrix::print(subMat.subMatrix, subMat.alphabetSize);
     std::cout << "\n";
 
@@ -62,15 +62,12 @@ int main (int, const char**) {
     seqSet.push_back(&s3);
     seqSet.push_back(&s4);
     //seqSet.push_back(s5);
-    EvalueComputation evaluer(100000, &subMat, par.gapOpen, par.gapExtend);
-    Matcher * aligner = new Matcher(Parameters::DBTYPE_AMINO_ACIDS, 10000, &subMat, &evaluer, false, par.gapOpen, par.gapExtend);
+    EvalueComputation evaluer(100000, &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
+    Matcher * aligner = new Matcher(Parameters::DBTYPE_AMINO_ACIDS, 10000, &subMat, &evaluer, false, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
     MultipleAlignment msaAligner(1000, 10, &subMat, aligner);
     MultipleAlignment::MSAResult res = msaAligner.computeMSA(&s1, seqSet, true);
-    MsaFilter filter(1000, 10000, &subMat, par.gapOpen, par.gapExtend);
-    size_t filterSetSize = res.setSize;
-    filter.filter(res.setSize, res.centerLength, 0, 0, -20.0, 50, 100,
-                     (const char**)res.msaSequence, &filterSetSize);
-    filter.shuffleSequences((const char**)res.msaSequence, res.setSize);
+    MsaFilter filter(1000, 10000, &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
+    size_t filterSetSize = filter.filter(res, 0, 0, -20.0, 50, 100);
     std::cout << "Filtered:" << filterSetSize << std::endl;
     MultipleAlignment::print(res, &subMat);
     PSSMCalculator pssm(&subMat, 1000, 5, 1.0, 1.5);
diff --git a/src/test/TestPSSM.cpp b/src/test/TestPSSM.cpp
index 6ee8dbc..8e301ed 100644
--- a/src/test/TestPSSM.cpp
+++ b/src/test/TestPSSM.cpp
@@ -20,7 +20,7 @@ int main (int, const char**) {
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
     std::cout << "Subustitution matrix:";
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
     //   BaseMatrix::print(subMat.subMatrix, subMat.alphabetSize);
     const char *seqs[1001];
     int counter = 0;
@@ -1577,19 +1577,16 @@ int main (int, const char**) {
     for (int k = 0; k < counter; ++k) {
         seqsCpy[k] = MultipleAlignment::initX(122);
         for (int pos = 0; pos < 122; ++pos) {
-//            seqs[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : subMat.aa2int[(int) seqs[k][pos]];
-            seqsCpy[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : subMat.aa2int[(int) seqs[k][pos]];
+//            seqs[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : subMat.aa2num[(int) seqs[k][pos]];
+            seqsCpy[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : static_cast<int>(subMat.aa2num[(int) seqs[k][pos]]);
         }
     }
 
     MultipleAlignment::MSAResult res(122, 122, counter, seqsCpy);
     MultipleAlignment::print(res, &subMat);
 
-    MsaFilter msaFilter(10000, counter, &subMat, par.gapOpen, par.gapExtend);
-    size_t filterSetSize = res.setSize;
-    msaFilter.filter(res.setSize, res.centerLength, 0, 0, -20.0f, 90, 100,
-                     (const char**)res.msaSequence, &filterSetSize);
-    msaFilter.shuffleSequences((const char**)res.msaSequence, res.setSize);
+    MsaFilter msaFilter(10000, counter, &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
+    size_t filterSetSize = msaFilter.filter(res, 0, 0, -20.0f, 90, 100);
     std::cout << "Filtered:" << filterSetSize << std::endl;
 //    for(size_t k = 0; k < res.setSize; k++){
 //        std::cout << "k=" << k << "\t" << (int)filterResult.keep[k] << std::endl;
@@ -1599,7 +1596,7 @@ int main (int, const char**) {
         printf("k=%.3zu ", k);
         for (size_t pos = 0; pos < res.centerLength; pos++) {
             char aa = res.msaSequence[k][pos];
-            printf("%c", (aa < MultipleAlignment::NAA) ? subMat.int2aa[(int) aa] : '-');
+            printf("%c", (aa < MultipleAlignment::NAA) ? subMat.num2aa[(int) aa] : '-');
         }
         printf("\n");
     }
diff --git a/src/test/TestPSSMPrune.cpp b/src/test/TestPSSMPrune.cpp
index 1bcff1e..aadb13b 100644
--- a/src/test/TestPSSMPrune.cpp
+++ b/src/test/TestPSSMPrune.cpp
@@ -21,7 +21,7 @@ int main (int, const char**) {
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, -0.0);
     std::cout << "Subustitution matrix:";
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
     //   BaseMatrix::print(subMat.subMatrix, subMat.alphabetSize);
     const char *seqs[1001];
     int counter = 0;
@@ -50,15 +50,15 @@ int main (int, const char**) {
     for (int k = 0; k < counter; ++k) {
         seqsCpy[k] = MultipleAlignment::initX(strlen(seqs[0]));
         for (size_t pos = 0; pos < strlen(seqs[0]); ++pos) {
-//            seqs[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : subMat.aa2int[(int) seqs[k][pos]];
-            seqsCpy[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : subMat.aa2int[(int) seqs[k][pos]];
+//            seqs[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : subMat.aa2num[(int) seqs[k][pos]];
+            seqsCpy[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : static_cast<int>(subMat.aa2num[(int) seqs[k][pos]]);
         }
     }
 
     MultipleAlignment::MSAResult res(strlen(seqs[0]), strlen(seqs[0]), counter, seqsCpy);
     MultipleAlignment::print(res, &subMat);
 
-    MsaFilter msaFilter(10000, counter, &subMat, par.gapOpen, par.gapExtend);
+    MsaFilter msaFilter(10000, counter, &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
     msaFilter.pruneAlignment((char**)res.msaSequence, res.setSize, res.centerLength);
 
     std::cout <<"Pruned MSA" << std::endl;
@@ -66,14 +66,11 @@ int main (int, const char**) {
         //printf("k=%.3d ", k);
         for(size_t pos = 0; pos < res.centerLength; pos++){
             char aa = res.msaSequence[k][pos];
-            printf("%c", (aa < MultipleAlignment::NAA) ? subMat.int2aa[(int)aa] : '-' );
+            printf("%c", (aa < MultipleAlignment::NAA) ? subMat.num2aa[(int)aa] : '-' );
         }
         printf("\n");
     }
-    size_t filterSetSize = res.setSize;
-    msaFilter.filter(res.setSize, res.centerLength, 0, 0, -20.0f, 90, 100,
-                     (const char**)res.msaSequence, &filterSetSize);
-    msaFilter.shuffleSequences((const char**)res.msaSequence, res.setSize);
+    size_t filterSetSize = msaFilter.filter(res, 0, 0, -20.0f, 90, 100);
     std::cout << "Filtered:" << filterSetSize << std::endl;
 //    for(size_t k = 0; k < res.setSize; k++){
 //        std::cout << "k=" << k << "\t" << (int)filterResult.keep[k] << std::endl;
@@ -83,7 +80,7 @@ int main (int, const char**) {
         printf("k=%.3zu ", k);
         for (size_t pos = 0; pos < res.centerLength; pos++) {
             char aa = res.msaSequence[k][pos];
-            printf("%c", (aa < MultipleAlignment::NAA) ? subMat.int2aa[(int) aa] : '-');
+            printf("%c", (aa < MultipleAlignment::NAA) ? subMat.num2aa[(int) aa] : '-');
         }
         printf("\n");
     }
diff --git a/src/test/TestProfileAlignment.cpp b/src/test/TestProfileAlignment.cpp
index 6c2eb35..2e6f5a7 100644
--- a/src/test/TestProfileAlignment.cpp
+++ b/src/test/TestProfileAlignment.cpp
@@ -26,7 +26,7 @@ int main (int, const char**) {
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, 0);
     std::cout << "Subustitution matrix:";
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
     //   BaseMatrix::print(subMat.subMatrix, subMat.alphabetSize);
     std::cout << "";
 
@@ -764,7 +764,7 @@ int main (int, const char**) {
         msaSequence[k] = new char[centerSeqSize];
         for (unsigned int pos = 0; pos < centerSeqSize; ++pos) {
             msaSequence[k][pos] = (msaSeq[k][pos] == '-') ?
-                                  21 : subMat.aa2int[(int) msaSeq[k][pos]];
+                                  21 : subMat.aa2num[(int) msaSeq[k][pos]];
         }
     }
     PSSMCalculator::Profile pssmRet = pssmCalculator.computePSSMFromMSA(setSize,centerSeqSize, (const char **) msaSequence, false);
@@ -790,12 +790,12 @@ int main (int, const char**) {
     SmithWaterman aligner(15000, subMat.alphabetSize, false);
     int8_t * tinySubMat = new int8_t[subMat.alphabetSize*subMat.alphabetSize];
 
-    aligner.ssw_init(s, s->getAlignmentProfile(), &subMat, subMat.alphabetSize, 2);
+    aligner.ssw_init(s, s->getAlignmentProfile(), &subMat, 2);
     int32_t maskLen = s->L / 2;
     int gap_open = 10;
     int gap_extend = 1;
     EvalueComputation evalueComputation(100000, &subMat, gap_open, gap_extend);
-    s_align alignment = aligner.ssw_align(dbSeq->int_sequence, dbSeq->L, gap_open, gap_extend, 0, 10000, &evalueComputation, 0, 0.0, maskLen);
+    s_align alignment = aligner.ssw_align(dbSeq->numSequence, dbSeq->L, gap_open, gap_extend, 0, 10000, &evalueComputation, 0, 0.0, maskLen);
     if(alignment.cigar){
         std::cout << "Cigar" << std::endl;
 
@@ -805,20 +805,20 @@ int main (int, const char**) {
             uint32_t length = SmithWaterman::cigar_int_to_len(alignment.cigar[c]);
             for (uint32_t i = 0; i < length; ++i){
                 if (letter == 'M') {
-                    fprintf(stdout,"%c",subMat.int2aa[dbSeq->int_sequence[targetPos]]);
-                    if (dbSeq->int_sequence[targetPos] == s->int_sequence[queryPos]){
+                    fprintf(stdout,"%c",subMat.num2aa[dbSeq->numSequence[targetPos]]);
+                    if (dbSeq->numSequence[targetPos] == s->numSequence[queryPos]){
                         fprintf(stdout, "|");
                     }
                     else fprintf(stdout, "*");
-                    fprintf(stdout,"%c",subMat.int2aa[s->int_sequence[queryPos]]);
+                    fprintf(stdout,"%c",subMat.num2aa[s->numSequence[queryPos]]);
                     ++queryPos;
                     ++targetPos;
                 } else {
                     if (letter == 'I'){
-                        fprintf(stdout,"%c |",subMat.int2aa[s->int_sequence[queryPos]]);
+                        fprintf(stdout,"%c |",subMat.num2aa[s->numSequence[queryPos]]);
                         ++queryPos;
                     } else{
-                        fprintf(stdout,"| %c",subMat.int2aa[dbSeq->int_sequence[targetPos]]);
+                        fprintf(stdout,"| %c",subMat.num2aa[dbSeq->numSequence[targetPos]]);
                         ++targetPos;
                     }
                 }
diff --git a/src/test/TestProfileStates.cpp b/src/test/TestProfileStates.cpp
index 6d10abb..d745866 100644
--- a/src/test/TestProfileStates.cpp
+++ b/src/test/TestProfileStates.cpp
@@ -25,7 +25,7 @@ int main (int, const char**) {
 
 
     std::cout << "Subustitution matrix:";
-    SubstitutionMatrix::print(subMat.subMatrix,subMat.int2aa,subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix,subMat.num2aa,subMat.alphabetSize);
     //   BaseMatrix::print(subMat.subMatrix, subMat.alphabetSize);
     const char *seqs[1001];
     int counter = 0;
@@ -580,20 +580,16 @@ int main (int, const char**) {
     for (int k = 0; k < counter; ++k) {
         seqsCpy[k] = MultipleAlignment::initX(122);
         for (int pos = 0; pos < 122; ++pos) {
-//            seqs[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : subMat.aa2int[(int) seqs[k][pos]];
-            seqsCpy[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : subMat.aa2int[(int) seqs[k][pos]];
+//            seqs[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : subMat.aa2num[(int) seqs[k][pos]];
+            seqsCpy[k][pos] = (seqs[k][pos] == '-') ? MultipleAlignment::GAP : static_cast<int>(subMat.aa2num[(int) seqs[k][pos]]);
         }
     }
 
     MultipleAlignment::MSAResult res(122, 122, counter, seqsCpy);
     MultipleAlignment::print(res, &subMat);
 
-    MsaFilter msaFilter(10000, counter, &subMat, par.gapOpen, par.gapExtend);
-    size_t filteredSetSize = res.setSize;
-
-    msaFilter.filter(res.setSize, res.centerLength, 0, 0,-20.0, 90, 100, (const char **)res.msaSequence, &filteredSetSize);
-
-    msaFilter.shuffleSequences((const char **) res.msaSequence, res.setSize);
+    MsaFilter msaFilter(10000, counter, &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
+    size_t filteredSetSize = msaFilter.filter(res, 0, 0,-20.0, 90, 100);
 
 /*    std::cout << "Filtered:" << filterResult.setSize << std::endl;
 //    for(size_t k = 0; k < res.setSize; k++){
@@ -605,7 +601,7 @@ int main (int, const char**) {
         printf("k=%.3zu ", k);
         for(size_t pos = 0; pos < res.centerLength; pos++){
             char aa = filterResult.filteredMsaSequence[k][pos];
-            printf("%c", (aa < MultipleAlignment::NAA) ? subMat.int2aa[(int)aa] : '-' );
+            printf("%c", (aa < MultipleAlignment::NAA) ? subMat.num2aa[(int)aa] : '-' );
         }
         printf("\n");
     }
diff --git a/src/test/TestReduceMatrix.cpp b/src/test/TestReduceMatrix.cpp
index 9015a13..80a5cfa 100644
--- a/src/test/TestReduceMatrix.cpp
+++ b/src/test/TestReduceMatrix.cpp
@@ -17,47 +17,47 @@ int main (int, const char**) {
     const int reductionAlphabetSize = 17;
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, 0);
-    subMat.print(subMat.subMatrix, subMat.int2aa,21);
+    subMat.print(subMat.subMatrix, subMat.num2aa,21);
     for(int i = 0; i<subMat.alphabetSize;i++)
-        printf("(%d, %c) ",i,subMat.int2aa[i]);
+        printf("(%d, %c) ",i,subMat.num2aa[i]);
     printf("\n");
-    ReducedMatrix redMat(subMat.probMatrix, subMat.subMatrixPseudoCounts, subMat.aa2int, subMat.int2aa, subMat.alphabetSize, reductionAlphabetSize, subMat.getBitFactor());
+    ReducedMatrix redMat(subMat.probMatrix, subMat.subMatrixPseudoCounts, subMat.aa2num, subMat.num2aa, subMat.alphabetSize, reductionAlphabetSize, subMat.getBitFactor());
     std::cout << "\n";
     printf("Normal alphabet : ");
     for(int i = 0; i<subMat.alphabetSize;i++)
-        printf("(%c) ",subMat.int2aa[i]);
+        printf("(%c) ",subMat.num2aa[i]);
     printf("\nReduced alphabet: ");
     for(int i = 0; i<redMat.alphabetSize;i++)
-        printf("(%c) ",redMat.int2aa[i]);
+        printf("(%c) ",redMat.num2aa[i]);
     std::cout << "\nReduced alphabet size: " << redMat.alphabetSize << "\n";
 
-    std::cout << "aa2int: \n";
+    std::cout << "aa2num: \n";
     for (char c = 'A'; c <= 'Z'; c++)
-        printf("%c%3d\t", c, subMat.aa2int[(int)c]);
+        printf("%c%3d\t", c, subMat.aa2num[(int)c]);
     std::cout << "\n";
 
-    std::cout << "int2aa: \n";
+    std::cout << "num2aa: \n";
     for (int i = 0; i < subMat.alphabetSize; i++)
-        printf("%d%3c\t", i, subMat.int2aa[i]);
+        printf("%d%3c\t", i, subMat.num2aa[i]);
     std::cout << "\n";
 
-    std::cout << "reduced aa2int:\n";
+    std::cout << "reduced aa2num:\n";
     for (char c = 'A'; c <= 'Z'; c++)
-        printf("%c%3d\t", c, redMat.aa2int[(int)c]);
+        printf("%c%3d\t", c, redMat.aa2num[(int)c]);
     std::cout << "\n";
 
-    std::cout << "reduced int2aa: \n";
+    std::cout << "reduced num2aa: \n";
     for (int i = 0; i < redMat.alphabetSize; i++)
-        printf("%d%3c\t", i, redMat.int2aa[i]);
+        printf("%d%3c\t", i, redMat.num2aa[i]);
     std::cout << "\n";
 
     printf("\n\nOriginal substitution matrix:\n");
-    subMat.print(subMat.subMatrix, subMat.int2aa,21);
-    subMat.print(subMat.probMatrix, subMat.int2aa,21);
+    subMat.print(subMat.subMatrix, subMat.num2aa,21);
+    subMat.print(subMat.probMatrix, subMat.num2aa,21);
 
     printf("\n\nReduced substitution matrix:\n");
-    subMat.print(redMat.subMatrix, redMat.int2aa,reductionAlphabetSize);
-    subMat.print(redMat.probMatrix, redMat.int2aa,reductionAlphabetSize);
+    subMat.print(redMat.subMatrix, redMat.num2aa,reductionAlphabetSize);
+    subMat.print(redMat.probMatrix, redMat.num2aa,reductionAlphabetSize);
 
 
     return 0;
diff --git a/src/test/TestSequenceIndex.cpp b/src/test/TestSequenceIndex.cpp
index 883b22c..b202fcf 100644
--- a/src/test/TestSequenceIndex.cpp
+++ b/src/test/TestSequenceIndex.cpp
@@ -48,7 +48,7 @@ int main (int, const char**) {
     if (s1res.second != S1.length())
         std::cout << "Diff length" << std::endl;
     for (size_t i = 0; i < s1res.second; i++) {
-        if (subMat.int2aa[s1res.first[i]] != S1char[i]) {
+        if (subMat.num2aa[s1res.first[i]] != S1char[i]) {
             std::cout << "Wrong data" << std::endl;
         }
     }
@@ -56,7 +56,7 @@ int main (int, const char**) {
     if (s2res.second != S2.length())
         std::cout << "Diff length" << std::endl;
     for (size_t i = 0; i < s2res.second; i++) {
-        if (subMat.int2aa[s2res.first[i]] != S2char[i]) {
+        if (subMat.num2aa[s2res.first[i]] != S2char[i]) {
             std::cout << "Wrong data" << std::endl;
         }
     }
@@ -66,14 +66,14 @@ int main (int, const char**) {
         std::cout << "Diff length" << std::endl;
 
     for (size_t i = 0; i < s3res.second; i++) {
-        if (subMat.int2aa[s3res.first[i]] != S3char[i]) {
+        if (subMat.num2aa[s3res.first[i]] != S3char[i]) {
             std::cout << "Wrong data" << std::endl;
         }
     }
 
     std::pair<const unsigned char *, const unsigned int> s4res = lookup.getSequence(3);
     for (size_t i = 0; i < s4res.second; i++) {
-        if (subMat.int2aa[s4res.first[i]] != S4char[i]) {
+        if (subMat.num2aa[s4res.first[i]] != S4char[i]) {
             std::cout << "Wrong data" << std::endl;
         }
     }
diff --git a/src/test/TestTanTan.cpp b/src/test/TestTanTan.cpp
index 27c8f4e..c95a1ec 100644
--- a/src/test/TestTanTan.cpp
+++ b/src/test/TestTanTan.cpp
@@ -18,7 +18,7 @@ int main (int, const char**) {
     Parameters& par = Parameters::getInstance();
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, 0);
     std::cout << "Substitution matrix:";
-    SubstitutionMatrix::print(subMat.subMatrix, subMat.int2aa, subMat.alphabetSize);
+    SubstitutionMatrix::print(subMat.subMatrix, subMat.num2aa, subMat.alphabetSize);
 
     const char *ref = "MTLHSNSTTSSLFPNISSSWIHSPSDAGLPPGTVTHFGSYNVSRAAGNFSSPDGTTDDPLGGHTVWQVVFIAFLTGILALVTIIGNILVIVSFKVNKQLKTVNNYFLLSLACADLIIGVISMNLFTTYIIMNRWALGNLACDLWLAIDYVASNASVMNLLVISFDRYFSITRPLTYRAKRTTKRAGVMIGLAWVISFVLWAPAILFWQYFVGKRTVPPGECFIQFLSEPTITFGTAIAAFYMPVTIMTILYWRIYKETEKRTKELAGLQASGTEAETENFVHPTGSSRSCSSYELQQQSMKRSNRRKYGRCHFWFTTKSWKPSSEQMDQDHSSSDSWNNNDAAASLENSASSDEEDIGSETRAIYSIVLKLPGHSTILNSTKLPSSDNLQVPEEELGMVDLERKADKLQAQKSVDDGGSFPKSFSKLPIQLESAVDTAKTSDVNSSVGKSTATLPLSFKEATLAKRFALKTRSQITKRKRMSLVKEKKAAQTLSAILLAFIITWTPYNIMVLVNTFCDSCIPKTFWNLGYWLCYINSTVNPVCYALCNKTFRTTFKMLLLCQCDKKKRRKQQYQRQSVIFHKRAPEQAL";
     const size_t len = strlen(ref);
@@ -26,7 +26,7 @@ int main (int, const char**) {
     refSeq.mapSequence(0, 0, ref, strlen(ref));
 
     char hardMaskTable[256];
-    std::fill_n(hardMaskTable, 256, subMat.aa2int[(int) 'X']);
+    std::fill_n(hardMaskTable, 256, subMat.aa2num[(int) 'X']);
     double probMatrix[21][21];
 
     const double *probMatrixPointers[64];
@@ -44,7 +44,7 @@ int main (int, const char**) {
 
     for(size_t i = 0; i < 100000; i++){
         for(int i = 0; i < refSeq.L; i++){
-            refInt[i] = (char) refSeq.int_sequence[i];
+            refInt[i] = (char) refSeq.numSequence[i];
         }
         tantan::maskSequences(refInt, refInt+len, 50 /*options.maxCycleLength*/,
                               probMatrixPointers,
@@ -54,8 +54,8 @@ int main (int, const char**) {
                               0.5 /*options.minMaskProb*/, hardMaskTable);
     }
     for(int i = 0; i < refSeq.L; i++){
-//        refInt[i] = (char) refSeq.int_sequence[i];
-        std::cout << subMat.int2aa[(int)refInt[i]];
+//        refInt[i] = (char) refSeq.sequence[i];
+        std::cout << subMat.num2aa[(int)refInt[i]];
 
     }
     std::cout << std::endl;
diff --git a/src/test/TestTaxExpr.cpp b/src/test/TestTaxExpr.cpp
index 1da4051..59479f5 100644
--- a/src/test/TestTaxExpr.cpp
+++ b/src/test/TestTaxExpr.cpp
@@ -1,5 +1,7 @@
 #include <iostream>
 #include <cassert>
+#include <tinyexpr.h>
+#include <ExpressionParser.h>
 #include "Util.h"
 #include "TaxonomyExpression.h"
 
@@ -8,109 +10,107 @@ const char* binary_name = "test_taxexpr";
 
 
 int main (int, const char**) {
-//    TaxonomyExpression expression("(2,2157),4751,33208,33090,(2759,!4751,!33208,!33090)");
-    TaxonomyExpression expression1("2");
     std::string path = "/Users/mad/Documents/databases//swissprot/sprot_new";
-    NcbiTaxonomy * taxonomy = NcbiTaxonomy::openTaxonomy(path);
-
-    if(expression1.isAncestorOf(*taxonomy, 1117) != -1){
-        std::cout << "Found bacteria" << std::endl;
+    NcbiTaxonomy *taxonomy = NcbiTaxonomy::openTaxonomy(path);
+    TaxonomyExpression parser("!2", *taxonomy);
+    if(parser.isAncestor(9606) == true){
+        std::cout << "Found human" << std::endl;
     } else{
         assert(false);
     }
 
-    if(expression1.isAncestorOf(*taxonomy, 33630) == -1){
+    if(parser.isAncestor(1117) == false){
         std::cout << "Alveolata not ancestor" << std::endl;
     } else{
         assert(false);
     }
 
-    TaxonomyExpression expression2("(2759&!9606)");
+    TaxonomyExpression expression2("(2759&&!9606)",*taxonomy);
 
-    if(expression2.isAncestorOf(*taxonomy, 33630) != -1){
+    if(expression2.isAncestor(33630)){
         std::cout << "Found Alveolata" << std::endl;
     } else{
         assert(false);
     }
 
-    if(expression2.isAncestorOf(*taxonomy, 9606) == -1){
+    if(expression2.isAncestor( 9606) == false){
         std::cout << "Homo sapiens not ancestor" << std::endl;
     } else{
         assert(false);
     }
 
-    TaxonomyExpression expression3("(2759&!61964),10239");
+    TaxonomyExpression expression3("2759||10239",*taxonomy);
 
-    if(expression3.isAncestorOf(*taxonomy, 114777) == 1){
+    if(expression3.isAncestor(114777)){
         std::cout << "Found Natrialba phage PhiCh1" << std::endl;
     } else{
         assert(false);
     }
 
-    if(expression3.isAncestorOf(*taxonomy, 2759) == 0){
+    if(expression3.isAncestor(2759)){
         std::cout << "Found Eukaryota" << std::endl;
     } else{
         assert(false);
     }
 
-    if(expression3.isAncestorOf(*taxonomy, 61964) == -1){
-        std::cout << "Enviromental sample in not in" << std::endl;
+    if(expression3.isAncestor(61964)){
+        std::cout << "Found Enviromental sample" << std::endl;
     } else{
         assert(false);
     }
 
-    TaxonomyExpression expression4("2759,10239");
 
-    if(expression4.isAncestorOf(*taxonomy, 114777) == 1){
-        std::cout << "Found Natrialba phage PhiCh1" << std::endl;
-    } else{
-        assert(false);
-    }
+    TaxonomyExpression expression5("!2759",*taxonomy);
 
-    if(expression4.isAncestorOf(*taxonomy, 2759) == 0){
-        std::cout << "Found Eukaryota" << std::endl;
+    if(expression5.isAncestor(2)){
+        std::cout << "Found Bacteria" << std::endl;
     } else{
         assert(false);
     }
 
-    if(expression4.isAncestorOf(*taxonomy, 61964) == 0){
-        std::cout << "Found Enviromental sample" << std::endl;
+    if(expression5.isAncestor(2759) == false){
+        std::cout << "Eukaryota not in" << std::endl;
     } else{
         assert(false);
     }
 
 
-    TaxonomyExpression expression5("!2759");
+    TaxonomyExpression expression6("(2||2759)",*taxonomy);
 
-    if(expression5.isAncestorOf(*taxonomy, 2) == 0){
+    if(expression6.isAncestor(2)){
         std::cout << "Found Bacteria" << std::endl;
     } else{
         assert(false);
     }
 
-    if(expression5.isAncestorOf(*taxonomy, 2759) == -1){
-        std::cout << "Eukaryota not in" << std::endl;
+    if(expression6.isAncestor(2759)){
+        std::cout << "Found Eukaryota" << std::endl;
     } else{
         assert(false);
     }
 
+    if(expression6.isAncestor( 10239) == false){
+        std::cout << "Virus sample not in" << std::endl;
+    } else{
+        assert(false);
+    }
 
-    TaxonomyExpression expression6("(2|2759)");
+    TaxonomyExpression expression7("(2&&!1117)",*taxonomy);
 
-    if(expression6.isAncestorOf(*taxonomy, 2) == 0){
-        std::cout << "Found Bacteria" << std::endl;
+    if(expression7.isAncestor(57723)){
+        std::cout << "Found Acidobacteria" << std::endl;
     } else{
         assert(false);
     }
 
-    if(expression6.isAncestorOf(*taxonomy, 2759) == 0){
-        std::cout << "Found Eukaryota" << std::endl;
+    if(expression7.isAncestor(1117) == false){
+        std::cout << "Cyanobacteria not in" << std::endl;
     } else{
         assert(false);
     }
 
-    if(expression6.isAncestorOf(*taxonomy, 10239) == -1){
-        std::cout << "Virus sample not in" << std::endl;
+    if(expression7.isAncestor(9606) == false){
+        std::cout << "Human not in" << std::endl;
     } else{
         assert(false);
     }
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index 07d9d69..24c6173 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -39,6 +39,7 @@ set(util_source_files
         util/profile2cs.cpp
         util/profile2pssm.cpp
         util/profile2seq.cpp
+        util/result2dnamsa.cpp
         util/result2flat.cpp
         util/result2msa.cpp
         util/result2rbh.cpp
@@ -62,6 +63,7 @@ set(util_source_files
         util/translatenucs.cpp
         util/translateaa.cpp
         util/tsv2db.cpp
+        util/tar2db.cpp
         util/proteinaln2nucl.cpp
         util/versionstring.cpp
         util/diskspaceavail.cpp
diff --git a/src/util/alignall.cpp b/src/util/alignall.cpp
index 07a85c4..6a9f424 100644
--- a/src/util/alignall.cpp
+++ b/src/util/alignall.cpp
@@ -15,8 +15,18 @@
 
 int alignall(int argc, const char **argv, const Command &command) {
     Parameters &par = Parameters::getInstance();
+    par.overrideParameterDescription(par.PARAM_ALIGNMENT_MODE, "How to compute the alignment:\n0: automatic\n1: only score and end_pos\n2: also start_pos and cov\n3: also seq.id", NULL, 0);
     par.parseParameters(argc, argv, command, true, 0, 0);
 
+    if (par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED) {
+        Debug(Debug::ERROR) << "Use rescorediagonal for ungapped alignment mode.\n";
+        EXIT(EXIT_FAILURE);
+    }
+    if (par.addBacktrace == true) {
+        par.alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
+    }
+    unsigned int swMode = Alignment::initSWMode(par.alignmentMode, par.covThr, par.seqIdThr);
+
     DBReader<unsigned int> tdbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
     tdbr.open(DBReader<unsigned int>::NOSORT);
     if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
@@ -24,21 +34,26 @@ int alignall(int argc, const char **argv, const Command &command) {
     }
     const int targetSeqType = tdbr.getDbtype();
 
+    int gapOpen, gapExtend;
     BaseMatrix *subMat;
     if (Parameters::isEqualDbtype(targetSeqType, Parameters::DBTYPE_NUCLEOTIDES)) {
-        subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, 0.0);
+        subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, par.scoreBias);
+        gapOpen = par.gapOpen.nucleotides;
+        gapExtend = par.gapExtend.nucleotides;
     } else {
         // keep score bias at 0.0 (improved ROC)
-        subMat = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
+        subMat = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, par.scoreBias);
+        gapOpen = par.gapOpen.aminoacids;
+        gapExtend = par.gapExtend.aminoacids;
     }
 
     DBReader<unsigned int> dbr_res(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
     dbr_res.open(DBReader<unsigned int>::LINEAR_ACCCESS);
 
-    DBWriter resultWriter(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_PREFILTER_RES);
+    DBWriter resultWriter(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_GENERIC_DB);
     resultWriter.open();
 
-    EvalueComputation evaluer(tdbr.getAminoAcidDBSize(), subMat, par.gapOpen, par.gapExtend);
+    EvalueComputation evaluer(tdbr.getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
     const size_t flushSize = 100000000;
     size_t iterations = static_cast<int>(ceil(static_cast<double>(dbr_res.getSize()) / static_cast<double>(flushSize)));
 
@@ -53,10 +68,10 @@ int alignall(int argc, const char **argv, const Command &command) {
             thread_idx = (unsigned int) omp_get_thread_num();
 #endif
 
-            Matcher matcher(targetSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.gapOpen, par.gapExtend);
+            Matcher matcher(targetSeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, gapOpen, gapExtend, par.zdrop);
 
-            Sequence query(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, par.spacedKmer, par.compBiasCorrection);
-            Sequence target(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, par.spacedKmer, par.compBiasCorrection);
+            Sequence query(par.maxSeqLen, targetSeqType, subMat, 0, false, par.compBiasCorrection);
+            Sequence target(par.maxSeqLen, targetSeqType, subMat, 0, false, par.compBiasCorrection);
 
             char buffer[1024 + 32768];
 
@@ -101,11 +116,11 @@ int alignall(int argc, const char **argv, const Command &command) {
                         }
 
                         const bool isIdentity = (queryId == targetId && par.includeIdentity) ? true : false;
-                        Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, FLT_MAX,
-                                                                       par.alignmentMode, par.seqIdMode, isIdentity);
+                        Matcher::result_t result = matcher.getSWResult(&target, INT_MAX, false, par.covMode, par.covThr, par.evalThr,
+                                                                       swMode, par.seqIdMode, isIdentity);
                         // checkCriteria and Util::canBeCovered always work together
                         if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) {
-                            size_t len = Matcher::resultToBuffer(tmpBuff, result, true, false);
+                            size_t len = Matcher::resultToBuffer(tmpBuff, result, par.addBacktrace);
                             resultWriter.writeAdd(buffer, queryIdLen + len, thread_idx);
                         }
                     }
diff --git a/src/util/alignbykmer.cpp b/src/util/alignbykmer.cpp
index f319ff3..f28d5a3 100644
--- a/src/util/alignbykmer.cpp
+++ b/src/util/alignbykmer.cpp
@@ -41,6 +41,7 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
         querySeqType = qdbr->getDbtype();
     }
 
+    int gapOpen, gapExtend;
     if(Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)){
         par.alphabetSize = 5;
         if(par.PARAM_SPACED_KMER_MODE.wasSet == false) {
@@ -49,18 +50,15 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
         if(par.PARAM_K.wasSet == false) {
             par.kmerSize = 9;
         }
-        if(par.PARAM_GAP_OPEN.wasSet == false){
-            par.gapOpen = 5;
-        }
-        if(par.PARAM_GAP_EXTEND.wasSet == false) {
-            par.gapExtend = 2;
-        }
-
+        gapOpen = par.gapOpen.nucleotides;
+        gapExtend = par.gapExtend.nucleotides;
     } else {
         if(par.PARAM_K.wasSet == false) {
             par.kmerSize = 4;
         }
         par.alphabetSize = 21;
+        gapOpen = par.gapOpen.aminoacids;
+        gapExtend = par.gapExtend.aminoacids;
     }
     par.printParameters(command.cmd, argc, argv, *command.params);
 
@@ -75,18 +73,18 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
     if (Parameters::isEqualDbtype(querySeqType,Parameters::DBTYPE_NUCLEOTIDES)) {
         subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, 0.0);
     } else {
-        if (par.alphabetSize == 21) {
+        if (par.alphabetSize.aminoacids == 21) {
             subMat = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
         } else {
             SubstitutionMatrix sMat(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
-            subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2int, sMat.int2aa,
-                    sMat.alphabetSize, par.alphabetSize, 2.0);
-            SubstitutionMatrix::print(subMat->subMatrix, subMat->int2aa, subMat->alphabetSize );
+            subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2num, sMat.num2aa,
+                    sMat.alphabetSize, par.alphabetSize.aminoacids, 2.0);
+            SubstitutionMatrix::print(subMat->subMatrix, subMat->num2aa, subMat->alphabetSize );
         }
     }
     ScoreMatrix _2merSubMatrix = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 2);
 
-    EvalueComputation evaluer(tdbr->getAminoAcidDBSize(), subMat, par.gapOpen, par.gapExtend);
+    EvalueComputation evaluer(tdbr->getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
 
     DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_ALIGNMENT_RES);
     resultWriter.open();
@@ -170,7 +168,7 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
             Sequence target(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, par.spacedKmer, false, true, par.spacedKmerPattern);
             KmerGenerator kmerGenerator(par.kmerSize, subMat->alphabetSize, 70.0);
             kmerGenerator.setDivideStrategy(NULL, &_2merSubMatrix);
-            size_t lookupSize = MathUtil::ipow<size_t>(par.alphabetSize, par.kmerSize);
+            size_t lookupSize = MathUtil::ipow<size_t>(subMat->alphabetSize, par.kmerSize);
             unsigned short * queryPosLookup = new unsigned short[lookupSize];
             memset(queryPosLookup, 255, lookupSize * sizeof(unsigned short) );
 
@@ -199,7 +197,7 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
                 query.mapSequence(id, queryId, querySeq, qdbr->getSeqLen(id));
 
                 while (query.hasNextKmer()) {
-                    const int *kmer = query.nextKmer();
+                    const unsigned char *kmer = query.nextKmer();
                     unsigned short pos = query.getCurrentPosition();
                     unsigned short kmerIdx = idxer.int2index(kmer);
                     if (queryPosLookup[kmerIdx] == USHRT_MAX) {
@@ -224,7 +222,7 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
                     target.mapSequence(targetId, dbKey, targetSeq, tdbr->getSeqLen(targetId));
                     size_t kmerPosSize = 0;
                     while (target.hasNextKmer()) {
-                        const int *kmer = target.nextKmer();
+                        const unsigned char *kmer = target.nextKmer();
                         unsigned short kmerIdx = idxer.int2index(kmer);
                         if (queryPosLookup[kmerIdx] != USHRT_MAX) {
                             unsigned short pos_j = target.getCurrentPosition();
@@ -311,7 +309,7 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
                             if (stretcheVec[currStretche].i_start > stretcheVec[prevPotentialStretche].i_end &&
                                     stretcheVec[currStretche].j_start > stretcheVec[prevPotentialStretche].i_end) {
                                 int bestScorePathPrevIsLast = dpMatrixRow[prevPotentialStretche].pathScore;
-                                int distance =  par.gapOpen + (stretcheVec[prevPotentialStretche].i_end - stretcheVec[currStretche].i_start)*par.gapExtend;
+                                int distance =  gapOpen + (stretcheVec[prevPotentialStretche].i_end - stretcheVec[currStretche].i_start) * gapExtend;
                                 int costOfPrevToCurrTransition = distance;
                                 int currScore = stretcheVec[currStretche].kmerCnt*par.kmerSize*2;
                                 int currScoreWithPrev = bestScorePathPrevIsLast + costOfPrevToCurrTransition + currScore;
@@ -347,19 +345,19 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
 
 //                        for (int i = strechtPath[stretch].i_end, j = strechtPath[stretch].j_end;
 //                             i < strechtPath[stretch - 1].i_start; i++, j++) {
-//                            std::cout << subMat->int2aa[query.int_sequence[i]];
+//                            std::cout << subMat->num2aa[query.sequence[i]];
 //                        }
 //                        std::cout << std::endl;
 //
 //                        for (int i = strechtPath[stretch].i_end, j = strechtPath[stretch].j_end;
 //                             i < strechtPath[stretch - 1].i_start; i++, j++) {
-//                            std::cout << subMat->int2aa[target.int_sequence[j]];
+//                            std::cout << subMat->num2aa[target.sequence[j]];
 //                        }
 //                        std::cout << std::endl;
 
                         for (int i = strechtPath[stretch].i_end, j = strechtPath[stretch].j_end;
                              i < strechtPath[stretch - 1].i_start && j < strechtPath[stretch - 1].j_start; i++, j++) {
-                            int curr = subMat->subMatrix[query.int_sequence[i]][target.int_sequence[j]];
+                            int curr = subMat->subMatrix[query.numSequence[i]][target.numSequence[j]];
                             score = curr + score;
 //                            score = (score < 0) ? 0 : score;
                             scores[pos] = score;
@@ -373,7 +371,7 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
                         score = 0;
                         for (int i = strechtPath[stretch - 1].i_start, j = strechtPath[stretch - 1].j_start;
                              i > strechtPath[stretch].i_end && j > strechtPath[stretch].j_end; i--, j--) {
-                            int curr = subMat->subMatrix[query.int_sequence[i]][target.int_sequence[j]];
+                            int curr = subMat->subMatrix[query.numSequence[i]][target.numSequence[j]];
                             score = curr + score;
 //                            score = (score < 0) ? 0 : score;
                             if (scores[pos] + score > maxScore) {
@@ -395,7 +393,7 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
                         int maxScore = 0;
                         int score = 0;
                         for (int i = strechtPath[strechtPath.size()-1].i_start, j = strechtPath[strechtPath.size()-1].j_start; i > -1 && j > -1; i--, j--) {
-                            int curr = subMat->subMatrix[query.int_sequence[i]][target.int_sequence[j]];
+                            int curr = subMat->subMatrix[query.numSequence[i]][target.numSequence[j]];
                             score = curr + score;
                             //                            score = (score < 0) ? 0 : score;
                             if (score > maxScore) {
@@ -406,7 +404,7 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
                         score = 0;
 
                         for (int i = strechtPath[0].i_end, j = strechtPath[0].j_end; i < query.L && j < target.L; i++, j++) {
-                            int curr = subMat->subMatrix[query.int_sequence[i]][target.int_sequence[j]];
+                            int curr = subMat->subMatrix[query.numSequence[i]][target.numSequence[j]];
                             score = curr + score;
                             //                            score = (score < 0) ? 0 : score;
                             if (score > maxScore) {
@@ -428,27 +426,27 @@ int alignbykmer(int argc, const char **argv, const Command &command) {
                     for (int stretch = strechtPath.size()-1; stretch > -1 ; stretch--) {
                         for (size_t i = strechtPath[stretch].i_start, j = strechtPath[stretch].j_start;
                                i < strechtPath[stretch].i_end; i++, j++) {
-//                            querystr.push_back(subMat->int2aa[query.int_sequence[i]]);
-//                            targetstr.push_back(subMat->int2aa[target.int_sequence[j]]);
+//                            querystr.push_back(subMat->num2aa[query.sequence[i]]);
+//                            targetstr.push_back(subMat->num2aa[target.sequence[j]]);
                             bt.push_back('M');
-                            ids += (query.int_sequence[i] == target.int_sequence[j]);
-                            score += subMat->subMatrix[query.int_sequence[i]][target.int_sequence[j]];
+                            ids += (query.numSequence[i] == target.numSequence[j]);
+                            score += subMat->subMatrix[query.numSequence[i]][target.numSequence[j]];
                         }
                         if (stretch > 0) {
-                            score -= par.gapOpen;
+                            score -= gapOpen;
                             if (strechtPath[stretch-1].i_start==strechtPath[stretch].i_end) {
                                 for (size_t pos = strechtPath[stretch].j_end; pos < strechtPath[stretch-1].j_start; pos++) {
 //                                    querystr.push_back('-');
-//                                    targetstr.push_back(subMat->int2aa[target.int_sequence[pos]]);
+//                                    targetstr.push_back(subMat->num2aa[target.sequence[pos]]);
                                     bt.push_back('I');
-                                    score -= par.gapExtend;
+                                    score -= gapExtend;
                                 }
                             } else {
                                 for (size_t pos = strechtPath[stretch].i_end; pos < strechtPath[stretch-1].i_start; pos++) {
-//                                    querystr.push_back(subMat->int2aa[query.int_sequence[pos]]);
+//                                    querystr.push_back(subMat->num2aa[query.sequence[pos]]);
 //                                    targetstr.push_back('-');
                                     bt.push_back('D');
-                                    score -= par.gapExtend;
+                                    score -= gapExtend;
                                 }
                             }
                         }
diff --git a/src/util/apply.cpp b/src/util/apply.cpp
index 7d5d1b0..dfe9c0a 100644
--- a/src/util/apply.cpp
+++ b/src/util/apply.cpp
@@ -4,6 +4,12 @@
 #include "Util.h"
 #include "Debug.h"
 
+#if defined(__CYGWIN__) || defined(WASM)
+int apply(int argc, const char **argv, const Command& command) { // stub built when __CYGWIN__ or WASM is defined; arguments are not used
+    Debug(Debug::ERROR) << "apply is not supported on Windows/Cygwin\n"; // NOTE(review): the same text is printed for WASM builds
+    EXIT(EXIT_FAILURE); // no return statement follows; presumably EXIT does not return - confirm
+}
+#else
 #include <climits>
 #include <unistd.h>
 #include <fcntl.h>
@@ -342,7 +348,7 @@ int apply(int argc, const char **argv, const Command& command) {
                     unsigned int key = reader.getDbKey(i);
                     char *data = reader.getData(i, thread);
                     if (*data == '\0') {
-                        writer.writeData(NULL, 0, key, thread);
+                        writer.writeData(NULL, 0, key, 0);
                         continue;
                     }
 
@@ -410,4 +416,4 @@ int apply(int argc, const char **argv, const Command& command) {
 
     return EXIT_SUCCESS;
 }
-
+#endif
diff --git a/src/util/clusthash.cpp b/src/util/clusthash.cpp
index 1bb3a4a..e377c42 100644
--- a/src/util/clusthash.cpp
+++ b/src/util/clusthash.cpp
@@ -1,167 +1,192 @@
-//
-// Created by mad on 2/25/16.
-//
-
-#include <limits>
-#include <string>
-#include <vector>
-#include <iomanip>
-#include <algorithm>
-
-#include "ReducedMatrix.h"
 #include "DBWriter.h"
-#include "SubstitutionMatrix.h"
 #include "Util.h"
 #include "Parameters.h"
 #include "Matcher.h"
 #include "Debug.h"
 #include "DBReader.h"
+#include "ReducedMatrix.h"
 #include "DistanceCalculator.h"
+#include "Orf.h"
+#include "omptl/omptl_algorithm"
 
 #ifdef OPENMP
 #include <omp.h>
 #endif
 
-void setClustHashDefaults(Parameters *p) {
-    p->alphabetSize = Parameters::CLUST_HASH_DEFAULT_ALPH_SIZE;
-
-}
-
-int clusthash(int argc, const char **argv, const Command& command) {
-    Parameters& par = Parameters::getInstance();
-    setClustHashDefaults(&par);
+int clusthash(int argc, const char **argv, const Command &command) { // Hashes every db1 sequence, then reports same-hash, equal-length sequences whose identity reaches par.seqIdThr into db2.
+    Parameters &par = Parameters::getInstance();
+    par.alphabetSize = MultiParam<int>(Parameters::CLUST_HASH_DEFAULT_ALPH_SIZE,5); // module default set before parsing; presumably (aminoacids, nucleotides) - TODO confirm
+    par.seqIdThr = (float)Parameters::CLUST_HASH_DEFAULT_MIN_SEQ_ID/100.0f; // default constant is divided by 100 to obtain a fraction
     par.parseParameters(argc, argv, command, true, 0, 0);
 
-    SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, -0.2);
-    ReducedMatrix redSubMat(subMat.probMatrix, subMat.subMatrixPseudoCounts, subMat.aa2int, subMat.int2aa, subMat.alphabetSize, par.alphabetSize, 2.0);
+    DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA | DBReader<unsigned int>::USE_INDEX);
+    reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
+    if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) { // every preload mode except PRELOAD_MODE_MMAP reads the data into memory up front
+        reader.readMmapedDataInMemory();
+    }
 
-    DBReader<unsigned int> seqDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_DATA|DBReader<unsigned int>::USE_INDEX);
-    seqDbr.open(DBReader<unsigned int>::NOSORT);
-    seqDbr.readMmapedDataInMemory();
+    const bool isNuclInput = Parameters::isEqualDbtype(reader.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES);
+    BaseMatrix *subMat = NULL;
+    if (isNuclInput == false) { // the reduced-alphabet matrix is only built for non-nucleotide input; subMat stays NULL otherwise
+        SubstitutionMatrix sMat(par.scoringMatrixFile.aminoacids, 2.0, -0.2);
+        subMat = new ReducedMatrix(sMat.probMatrix, sMat.subMatrixPseudoCounts, sMat.aa2num, sMat.num2aa, sMat.alphabetSize, par.alphabetSize.aminoacids, 2.0);
+    }
 
-    DBWriter dbw(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_PREFILTER_RES);
-    dbw.open();
-    Debug(Debug::INFO) << "Hashing sequences ... \n";
-    std::pair<size_t, unsigned int> * hashSeqPair = new  std::pair<size_t, unsigned int>[seqDbr.getSize()+1];
-    hashSeqPair[seqDbr.getSize()] = std::make_pair(UINT_MAX, 0); // needed later to check if one of array
-    Debug::Progress progress(seqDbr.getSize());
+    DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_ALIGNMENT_RES);
+    writer.open();
+    Debug(Debug::INFO) << "Hashing sequences...\n";
+    std::pair<size_t, unsigned int> *hashSeqPair = new std::pair<size_t, unsigned int>[reader.getSize() + 1];
+    // sentinel entry one past the last sequence: the equal-hash scan further down stops on it (assumes no real hash equals UINT_MAX)
+    hashSeqPair[reader.getSize()] = std::make_pair(UINT_MAX, 0);
+    Debug::Progress progress(reader.getSize());
 #pragma omp parallel
     {
         unsigned int thread_idx = 0;
 #ifdef OPENMP
         thread_idx = static_cast<unsigned int>(omp_get_thread_num());
 #endif
-        Sequence seq(par.maxSeqLen, Parameters::DBTYPE_AMINO_ACIDS, &redSubMat, 0, false, false);
-#pragma omp for schedule(dynamic, 10000)
-        for(size_t id = 0; id < seqDbr.getSize(); id++){
-            progress.updateProgress();
-            unsigned int queryKey = seqDbr.getDbKey(id);
-            char * data = seqDbr.getData(id, thread_idx);
-            seq.mapSequence(id, queryKey, data, seqDbr.getSeqLen(id));
-            size_t seqHash = Util::hash(seq.int_sequence, seq.L);
-            hashSeqPair[id] = std::make_pair(seqHash, id);
+        if (isNuclInput) { // nucleotide input: hash the raw characters directly, no matrix involved
+#pragma omp for schedule(dynamic, 100)
+            for (size_t id = 0; id < reader.getSize(); ++id) {
+                progress.updateProgress();
+                char *data = reader.getData(id, thread_idx);
+                size_t length = reader.getSeqLen(id);
+                const size_t INITIAL_VALUE = 0;
+                const size_t A = 31; // multiplier of the polynomial hash computed below
+                size_t h1 = INITIAL_VALUE;
+                size_t h2 = INITIAL_VALUE;
+                for (size_t i = 0; i < length; ++i){
+                    h1 = ((h1*A) + data[i]); // hash over the characters in stored order
+                    h2 = ((h2*A) + Orf::complement(data[length - i - 1])); // same hash over Orf::complement of the characters read back to front
+                }
+                hashSeqPair[id] = std::make_pair(std::min(h1, h2), id); // smaller of the two is kept; presumably so a sequence and its reverse complement share a hash - verify Orf::complement
+            }
+        } else {
+            Sequence seq(par.maxSeqLen, reader.getDbtype(), subMat, 0, false, false); // declared once per thread and reused for every id of the loop below
+#pragma omp for schedule(dynamic, 100)
+            for (size_t id = 0; id < reader.getSize(); ++id) {
+                progress.updateProgress();
+                char *data = reader.getData(id, thread_idx);
+                size_t length = reader.getSeqLen(id);
+                size_t seqHash;
+                seq.mapSequence(id, 0, data, length);
+                seqHash = Util::hash(seq.numSequence, seq.L); // hash is taken over the mapped numSequence, not the raw text
+                hashSeqPair[id] = std::make_pair(seqHash, id);
+            }
         }
     }
 
-
-
     // sort by hash and set up the pointer for parallel processing
-    std::sort(hashSeqPair, hashSeqPair + seqDbr.getSize());
+    omptl::sort(hashSeqPair, hashSeqPair + reader.getSize());
+
     size_t uniqHashes = 1;
     size_t prevHash = hashSeqPair[0].first;
-    for(size_t id = 0; id < seqDbr.getSize(); id++) {
-        if(prevHash !=  hashSeqPair[id].first){
+    for (size_t id = 0; id < reader.getSize(); id++) {
+        if (prevHash != hashSeqPair[id].first) {
             uniqHashes++;
         }
-        prevHash =  hashSeqPair[id].first;
+        prevHash = hashSeqPair[id].first;
     }
-    std::pair<size_t, unsigned int> ** hashLookup = new std::pair<size_t, unsigned int> *[uniqHashes];
+    std::pair<size_t, unsigned int> **hashLookup = new std::pair<size_t, unsigned int>*[uniqHashes];
     hashLookup[0] = hashSeqPair;
     size_t currKey = 1;
     prevHash = hashSeqPair[0].first;
-    for(size_t id = 0; id < seqDbr.getSize(); id++) {
+    for (size_t id = 0; id < reader.getSize(); ++id) {
         if (prevHash != hashSeqPair[id].first) {
             hashLookup[currKey] = (hashSeqPair + id);
             currKey++;
         }
         prevHash = hashSeqPair[id].first;
     }
-    Debug(Debug::INFO) << "Compute " << uniqHashes << " unique hashes.\n";
 
+    Debug(Debug::INFO) << "Found " << uniqHashes << " unique hashes\n";
 #pragma omp parallel
     {
+        int thread_idx = 0; // thread index is determined once per thread, outside the work loop
+#ifdef OPENMP
+        thread_idx = omp_get_thread_num();
+#endif
+
         std::vector<unsigned int> setIds;
+        setIds.reserve(300); // scratch containers are reused across hash groups and cleared after each one
         std::vector<bool> found;
+        found.reserve(300);
+        std::string result;
+        result.reserve(1024);
+        char buffer[64]; // receives the text produced by Util::fastSeqIdToBuffer
 
-#pragma omp for schedule(dynamic, 2)
-        for(size_t hashId = 0; hashId < uniqHashes; hashId++) {
-            size_t initHash = hashLookup[hashId]->first;
-            size_t pos = 0;
+#pragma omp for schedule(dynamic, 10)
+        for (size_t hashId = 0; hashId < uniqHashes; ++hashId) {
             progress.updateProgress();
 
-            int thread_idx = 0;
-#ifdef OPENMP
-            thread_idx = omp_get_thread_num();
-#endif
-            while(hashLookup[hashId][pos].first == initHash ){
+            size_t initHash = hashLookup[hashId]->first;
+            size_t pos = 0;
+            while (hashLookup[hashId][pos].first == initHash) { // gather all ids sharing initHash; the last group runs up to the sentinel entry
                 setIds.push_back(hashLookup[hashId][pos].second);
                 found.push_back(false);
                 pos++;
             }
-            for(size_t i = 0; i < setIds.size(); i++) {
-                unsigned int queryLength = seqDbr.getSeqLen(setIds[i]);
-                const char * querySeq =  seqDbr.getData(setIds[i], thread_idx);
-                std::stringstream swResultsSs;
-                swResultsSs << seqDbr.getDbKey(setIds[i]) << "\t";
-                swResultsSs << 255 << "\t";
-                swResultsSs << std::fixed << std::setprecision(3) << 1.0f << "\t";
-                swResultsSs << std::scientific << 0 << "\t";
-                swResultsSs << 0 << "\t";
-                swResultsSs << queryLength - 1 << "\t";
-                swResultsSs << queryLength << "\t";
-                swResultsSs << 0 << "\t";
-                swResultsSs << queryLength - 1 << "\t";
-                swResultsSs << queryLength << "\n";
-                if(found[i] == true){
+            for (size_t i = 0; i < setIds.size(); i++) {
+                unsigned int queryKey = reader.getDbKey(setIds[i]);
+                unsigned int queryLength = reader.getSeqLen(setIds[i]);
+                const char *querySeq = reader.getData(setIds[i], thread_idx);
+                result.append(SSTR(queryKey)); // every entry starts with a line for the sequence itself
+                result.append("\t255\t1.00\t0\t0\t");
+                result.append(SSTR(queryLength - 1));
+                result.append(1, '\t');
+                result.append(SSTR(queryLength));
+                result.append("\t0\t");
+                result.append(SSTR(queryLength - 1));
+                result.append(1, '\t');
+                result.append(SSTR(queryLength));
+                result.append(1, '\n');
+                if (found[i] == true) { // already listed under an earlier member: only the self line is written
                     goto outer;
                 }
 
                 for (size_t j = 0; j < setIds.size(); j++) {
-                    if(found[j] == true)
+                    if (found[j] == true) {
                         continue;
-                    unsigned int targetLength = seqDbr.getSeqLen(setIds[j]);
-                    if(i != j && queryLength == targetLength){
-                        const char * targetSeq = seqDbr.getData(setIds[j], thread_idx);
-                        unsigned int distance = DistanceCalculator::computeInverseHammingDistance(querySeq, targetSeq,
-                                                                                                  queryLength);
-                        float seqId = (static_cast<float>(distance))/static_cast<float>(queryLength);
-                        if(seqId >= par.seqIdThr) {
-                            swResultsSs << seqDbr.getDbKey(setIds[j]) << "\t";
-                            swResultsSs << 255 << "\t";
-                            swResultsSs << std::fixed << std::setprecision(3) << seqId << "\t";
-                            swResultsSs << std::scientific << 0 << "\t";
-                            swResultsSs << 0 << "\t";
-                            swResultsSs << queryLength - 1 << "\t";
-                            swResultsSs << queryLength << "\t";
-                            swResultsSs << 0 << "\t";
-                            swResultsSs << queryLength - 1 << "\t";
-                            swResultsSs << queryLength << "\n";
+                    }
+                    unsigned int targetLength = reader.getSeqLen(setIds[j]);
+                    if (i != j && queryLength == targetLength) { // compare only against other members of the same length
+                        const char *targetSeq = reader.getData(setIds[j], thread_idx);
+                        unsigned int distance = DistanceCalculator::computeInverseHammingDistance(querySeq, targetSeq, queryLength);
+                        const float seqId = (static_cast<float>(distance)) / static_cast<float>(queryLength);
+                        if (seqId >= par.seqIdThr) {
+                            result.append(SSTR(reader.getDbKey(setIds[j])));
+                            result.append("\t255\t");
+                            Util::fastSeqIdToBuffer(seqId, buffer); // NOTE(review): the append below assumes buffer is NUL-terminated by this call - confirm
+                            result.append(buffer);
+                            result.append("\t0\t0\t");
+                            result.append(SSTR(queryLength - 1));
+                            result.append(1, '\t');
+                            result.append(SSTR(queryLength));
+                            result.append("\t0\t");
+                            result.append(SSTR(queryLength - 1));
+                            result.append(1, '\t');
+                            result.append(SSTR(queryLength));
+                            result.append(1, '\n');
                             found[j] = true;
                         }
                     }
                 }
                 outer:
-                std::string swResultsString = swResultsSs.str();
-                const char* swResultsStringData = swResultsString.c_str();
-                dbw.writeData(swResultsStringData, swResultsString.length(), seqDbr.getDbKey(setIds[i]), thread_idx);
+                writer.writeData(result.c_str(), result.length(), queryKey, thread_idx); // the entry is stored under the member's own db key
+                result.clear();
             }
             setIds.clear();
             found.clear();
         }
     }
-    delete [] hashLookup;
-    delete [] hashSeqPair;
-    seqDbr.close();
-    dbw.close();
-    return 0;
+    writer.close();
+    reader.close();
+    delete[] hashLookup;
+    delete[] hashSeqPair;
+
+    if (subMat != NULL) {
+        delete subMat;
+    }
+
+    return EXIT_SUCCESS;
 }
diff --git a/src/util/convertalignments.cpp b/src/util/convertalignments.cpp
index 4d34ecb..f596904 100644
--- a/src/util/convertalignments.cpp
+++ b/src/util/convertalignments.cpp
@@ -164,14 +164,16 @@ int convertalignments(int argc, const char **argv, const Command &command) {
     NcbiTaxonomy * t = NULL;
     std::vector<std::pair<unsigned int, unsigned int>> mapping;
     if(needTaxonomy){
-        t = NcbiTaxonomy::openTaxonomy(par.db2);
+        std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2);
+        t = NcbiTaxonomy::openTaxonomy(db2NoIndexName);
     }
     if(needTaxonomy || needTaxonomyMapping){
-        if(FileUtil::fileExists(std::string(par.db2 + "_mapping").c_str()) == false){
-            Debug(Debug::ERROR) << par.db2 + "_mapping" << " does not exist. Please create the taxonomy mapping!\n";
+        std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2);
+        if(FileUtil::fileExists(std::string(db2NoIndexName + "_mapping").c_str()) == false){
+            Debug(Debug::ERROR) << db2NoIndexName + "_mapping" << " does not exist. Please create the taxonomy mapping!\n";
             EXIT(EXIT_FAILURE);
         }
-        bool isSorted = Util::readMapping( par.db2 + "_mapping", mapping);
+        bool isSorted = Util::readMapping( db2NoIndexName + "_mapping", mapping);
         if(isSorted == false){
             std::stable_sort(mapping.begin(), mapping.end(), compareToFirstInt);
         }
@@ -212,8 +214,8 @@ int convertalignments(int argc, const char **argv, const Command &command) {
         tDbrHeader = new IndexReader(par.db2, par.threads, IndexReader::SRC_HEADERS, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
     }
 
-    const bool queryNucs = Parameters::isEqualDbtype(qDbr.sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES);
-    const bool targetNucs = Parameters::isEqualDbtype(tDbr->sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES);
+    bool queryNucs = Parameters::isEqualDbtype(qDbr.sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES);
+    bool targetNucs = Parameters::isEqualDbtype(tDbr->sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES);
     if (needSequenceDB) {
         // try to figure out if search was translated. This is can not be solved perfectly.
         bool seqtargetAA = false;
@@ -233,17 +235,16 @@ int convertalignments(int argc, const char **argv, const Command &command) {
         }
     }
 
+    int gapOpen, gapExtend;
     SubstitutionMatrix * subMat= NULL;
     if (targetNucs == true && queryNucs == true && isTranslatedSearch == false) {
         subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, 0.0);
-        if(par.PARAM_GAP_OPEN.wasSet==false){
-            par.gapOpen = 5;
-        }
-        if(par.PARAM_GAP_EXTEND.wasSet==false){
-            par.gapExtend = 2;
-        }
+        gapOpen = par.gapOpen.nucleotides;
+        gapExtend = par.gapExtend.nucleotides;
     }else{
         subMat = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
+        gapOpen = par.gapOpen.aminoacids;
+        gapExtend = par.gapExtend.aminoacids;
     }
     EvalueComputation *evaluer = NULL;
     bool queryProfile = false;
@@ -251,7 +252,7 @@ int convertalignments(int argc, const char **argv, const Command &command) {
     if (needSequenceDB) {
         queryProfile = Parameters::isEqualDbtype(qDbr.sequenceReader->getDbtype(), Parameters::DBTYPE_HMM_PROFILE);
         targetProfile = Parameters::isEqualDbtype(tDbr->sequenceReader->getDbtype(), Parameters::DBTYPE_HMM_PROFILE);
-        evaluer = new EvalueComputation(tDbr->sequenceReader->getAminoAcidDBSize(), subMat, par.gapOpen, par.gapExtend);
+        evaluer = new EvalueComputation(tDbr->sequenceReader->getAminoAcidDBSize(), subMat, gapOpen, gapExtend);
     }
 
     DBReader<unsigned int> alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
@@ -324,7 +325,7 @@ int convertalignments(int argc, const char **argv, const Command &command) {
         queryProfData.reserve(1024);
 
         std::string queryBuffer;
-        queryProfData.reserve(1024);
+        queryBuffer.reserve(1024);
 
         std::string queryHeaderBuffer;
         queryHeaderBuffer.reserve(1024);
@@ -447,6 +448,7 @@ int convertalignments(int argc, const char **argv, const Command &command) {
                                 mappingIt = std::upper_bound(mapping.begin(), mapping.end(), val, compareToFirstInt);
                                 if (mappingIt == mapping.end() || mappingIt->first != val.first) {
                                     taxon = 0;
+                                    taxonNode = NULL;
                                 }else{
                                     taxon = mappingIt->second;
                                     if(needTaxonomy){
@@ -511,6 +513,10 @@ int convertalignments(int argc, const char **argv, const Command &command) {
                                         result.append(SSTR(res.score));
                                         break;
                                     case Parameters::OUTFMT_CIGAR:
+                                        if(isTranslatedSearch == true && targetNucs == true && queryNucs == true ){
+                                            Matcher::result_t::protein2nucl(res.backtrace, newBacktrace);
+                                            res.backtrace = newBacktrace;
+                                        }
                                         result.append(SSTR(res.backtrace));
                                         newBacktrace.clear();
                                         break;
@@ -631,7 +637,14 @@ int convertalignments(int argc, const char **argv, const Command &command) {
                             continue;
                         }
                         result.append(buffer, count);
-                        result.append(res.backtrace);
+                        if (isTranslatedSearch == true && targetNucs == true && queryNucs == true) {
+                            Matcher::result_t::protein2nucl(res.backtrace, newBacktrace);
+                            result.append(newBacktrace);
+                            newBacktrace.clear();
+
+                        } else {
+                            result.append(res.backtrace);
+                        }
                         result.append("\t*\t0\t0\t");
                         int start = std::min(res.qStartPos, res.qEndPos);
                         int end   = std::max(res.qStartPos, res.qEndPos);
diff --git a/src/util/convertmsa.cpp b/src/util/convertmsa.cpp
index 52bc61b..765664c 100644
--- a/src/util/convertmsa.cpp
+++ b/src/util/convertmsa.cpp
@@ -45,6 +45,8 @@ int convertmsa(int argc, const char **argv, const Command &command) {
     std::string identifier;
     std::string result;
     result.reserve(10 * 1024 * 1024);
+
+    Debug::Progress progress;
     while (std::getline(*in, line)) {
         size_t lineLength = line.length();
         if (lineLength < 1) {
@@ -52,6 +54,7 @@ int convertmsa(int argc, const char **argv, const Command &command) {
         }
 
         if (inEntry == false && line == "# STOCKHOLM 1.0") {
+            progress.updateProgress();
             inEntry = true;
             continue;
         }
diff --git a/src/util/convertprofiledb.cpp b/src/util/convertprofiledb.cpp
index dc1b239..29c7817 100644
--- a/src/util/convertprofiledb.cpp
+++ b/src/util/convertprofiledb.cpp
@@ -84,7 +84,7 @@ void parseHMM(char *data, std::string *sequence, std::string *header, char *prof
 //                truncPssmVal       =  std::max(-128.0f, truncPssmVal);
                 // rounding
 //                profileBuffer[curr_pos]  = static_cast<char>((truncPssmVal < 0.0) ? truncPssmVal - 0.5 : truncPssmVal + 0.5);
-//                Debug(Debug::INFO) << aa_num << " " << subMat->int2aa[aa_num] << " " << profile_score[pos_in_profile] << " " << score << " " << entry << " " << p << " " << backProb << " " << bitFactor << std::endl;
+//                Debug(Debug::INFO) << aa_num << " " << subMat->num2aa[aa_num] << " " << profile_score[pos_in_profile] << " " << score << " " << entry << " " << p << " " << backProb << " " << bitFactor << std::endl;
             }
             // shifted score by -128 to avoid \0
             profileBuffer[curr_pos] = Sequence::scoreMask(probs[aa_num]);
@@ -108,7 +108,7 @@ void parseHMM(char *data, std::string *sequence, std::string *header, char *prof
             }
         }
         // write query, consensus and neff
-        profileBuffer[curr_pos] = static_cast<char>(subMat->aa2int[(int) sequence->at(seq_pos)]);
+        profileBuffer[curr_pos] = subMat->aa2num[static_cast<int>(sequence->at(seq_pos))];
         curr_pos++;
         profileBuffer[curr_pos] = maxa;
         curr_pos++;
diff --git a/src/util/countkmer.cpp b/src/util/countkmer.cpp
index fcb36f6..3f830a2 100644
--- a/src/util/countkmer.cpp
+++ b/src/util/countkmer.cpp
@@ -48,14 +48,9 @@ int countkmer(int argc, const char **argv, const Command& command) {
         for (size_t i = 0; i < reader.sequenceReader->getSize(); i++) {
             char *data = reader.sequenceReader->getData(i, 0);
             s.mapSequence(i, 0, data, reader.sequenceReader->getSeqLen(i));
-            const int xIndex = s.subMat->aa2int[(int) 'X'];
             while (s.hasNextKmer()) {
-                const int *kmer = s.nextKmer();
-                int xCount = 0;
-                for (int pos = 0; pos < par.kmerSize; pos++) {
-                    xCount += (kmer[pos] == xIndex);
-                }
-                if (xCount > 0) {
+                const unsigned char *kmer = s.nextKmer();
+                if(s.kmerContainsX()){
                     continue;
                 }
 
@@ -72,7 +67,7 @@ int countkmer(int argc, const char **argv, const Command& command) {
         }else{
             idx.index2int(idx.workspace, i, par.kmerSize);
             for(int k = 0; k < par.kmerSize; k++){
-                std::cout << subMat->int2aa[idx.workspace[k]];
+                std::cout << subMat->num2aa[idx.workspace[k]];
             }
         }
         std::cout << "\t" << kmerCountTable[i] << std::endl;
diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp
index 0a1156a..5df0014 100644
--- a/src/util/createdb.cpp
+++ b/src/util/createdb.cpp
@@ -20,52 +20,17 @@ int createdb(int argc, const char **argv, const Command& command) {
     std::string dataFile = filenames.back();
     filenames.pop_back();
     for (size_t i = 0; i < filenames.size(); i++) {
-        if (FileUtil::fileExists(filenames[i].c_str()) == false) {
-            Debug(Debug::ERROR) << "File " << filenames[i] << " does not exist.\n";
-            EXIT(EXIT_FAILURE);
-        }
         if (FileUtil::directoryExists(filenames[i].c_str()) == true) {
             Debug(Debug::ERROR) << "File " << filenames[i] << " is a directory.\n";
             EXIT(EXIT_FAILURE);
         }
     }
 
-    KSeqWrapper *kseq = KSeqFactory(filenames[0].c_str());
-    // check what kind of datbase it is
-    bool isNuclDb = (par.dbType == 2) ? true : false;
-    if (par.dbType == 0) {
-        size_t isNuclCnt = 0;
-        if (kseq->ReadEntry()) {
-            const KSeqWrapper::KSeqEntry &e = kseq->entry;
-
-            size_t cnt = 0;
-            for (size_t i = 0; i < e.sequence.l; i++) {
-                switch (toupper(e.sequence.s[i])) {
-                    case 'T':
-                    case 'A':
-                    case 'G':
-                    case 'C':
-                    case 'U':
-                    case 'N':
-                        cnt++;
-                        break;
-                }
-            }
-            float nuclDNAFraction = static_cast<float>(cnt) / static_cast<float>(e.sequence.l);
-            if (nuclDNAFraction > 0.9) {
-                isNuclCnt += true;
-            }
-        }
-        if (isNuclCnt) {
-            isNuclDb = true;
-        }
-    }
-    delete kseq;
-
-
-    int dbType = Parameters::DBTYPE_AMINO_ACIDS;
-    if (par.dbType == 2 || (par.dbType == 0 && isNuclDb == true)) {
+    int dbType = -1;
+    if (par.dbType == 2) {
         dbType = Parameters::DBTYPE_NUCLEOTIDES;
+    } else if(par.dbType == 1){
+        dbType = Parameters::DBTYPE_AMINO_ACIDS;
     }
 
     std::string indexFile = dataFile + ".index";
@@ -74,6 +39,13 @@ int createdb(int argc, const char **argv, const Command& command) {
         Debug(Debug::WARNING) << "We recompute with --shuffle 0.\n";
         par.shuffleDatabase = false;
     }
+
+    if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && par.filenames[0] == "stdin") {
+        Debug(Debug::WARNING) << "Createdb-mode 0 can not be combined with stdin input.\n";
+        Debug(Debug::WARNING) << "We recompute with --createdb-mode 1.\n";
+        par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD;
+    }
+
     const unsigned int shuffleSplits = par.shuffleDatabase ? 32 : 1;
     if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && par.compressed) {
         Debug(Debug::WARNING) << "Compressed database can not be combined with --createdb-mode 0.\n";
@@ -85,7 +57,6 @@ int createdb(int argc, const char **argv, const Command& command) {
     std::string hdrIndexFile = dataFile + "_h.index";
 
     unsigned int entries_num = 0;
-    size_t count = 0;
     size_t sampleCount = 0;
 
     const char newline = '\n';
@@ -109,7 +80,7 @@ int createdb(int argc, const char **argv, const Command& command) {
     }
     DBWriter hdrWriter(hdrDataFile.c_str(), hdrIndexFile.c_str(), shuffleSplits, par.compressed, Parameters::DBTYPE_GENERIC_DB);
     hdrWriter.open();
-    DBWriter seqWriter(dataFile.c_str(), indexFile.c_str(), shuffleSplits, par.compressed, dbType);
+    DBWriter seqWriter(dataFile.c_str(), indexFile.c_str(), shuffleSplits, par.compressed, (dbType == -1) ? Parameters::DBTYPE_OMIT_FILE : dbType );
     seqWriter.open();
     for (size_t fileIdx = 0; fileIdx < filenames.size(); fileIdx++) {
         unsigned int numEntriesInCurrFile = 0;
@@ -124,7 +95,7 @@ int createdb(int argc, const char **argv, const Command& command) {
             EXIT(EXIT_FAILURE);
         }
 
-        kseq = KSeqFactory(filenames[fileIdx].c_str());
+        KSeqWrapper* kseq = KSeqFactory(filenames[fileIdx].c_str());
         while (kseq->ReadEntry()) {
             progress.updateProgress();
             const KSeqWrapper::KSeqEntry &e = kseq->entry;
@@ -134,21 +105,24 @@ int createdb(int argc, const char **argv, const Command& command) {
             }
 
             // header
-            header.append(e.name.s, e.name.l);
-            if (e.comment.l > 0) {
-                header.append(" ", 1);
-                header.append(e.comment.s, e.comment.l);
-            }
+            if(par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_HARD){
+                header.append(e.name.s, e.name.l);
+                if (e.comment.l > 0) {
+                    header.append(" ", 1);
+                    header.append(e.comment.s, e.comment.l);
+                }
 
-            std::string headerId = Util::parseFastaHeader(header.c_str());
-            if (headerId.empty()) {
-                // An identifier is necessary for these two cases, so we should just give up
-                Debug(Debug::WARNING) << "Can not extract identifier from entry " << entries_num << ".\n";
+                std::string headerId = Util::parseFastaHeader(header.c_str());
+                if (headerId.empty()) {
+                    // An identifier is necessary for these two cases, so we should just give up
+                    Debug(Debug::WARNING) << "Can not extract identifier from entry " << entries_num << ".\n";
+                }
+                header.push_back('\n');
             }
             unsigned int id = par.identifierOffset + entries_num;
-            if (par.dbType == 0) {
+            if (dbType == -1) {
                 // check for the first 10 sequences if they are nucleotide sequences
-                if (count < 10 || (count % 100) == 0) {
+                if (sampleCount < 10 || (sampleCount % 100) == 0) {
                     if (sampleCount < testForNucSequence) {
                         size_t cnt = 0;
                         for (size_t i = 0; i < e.sequence.l; i++) {
@@ -170,22 +144,10 @@ int createdb(int argc, const char **argv, const Command& command) {
                     }
                     sampleCount++;
                 }
-                bool redoComp = false;
-                if (isNuclCnt == sampleCount || isNuclCnt == testForNucSequence) {
-                    isNuclDb = true;
-                } else if (isNuclDb == true && isNuclCnt != sampleCount) {
-                    Debug(Debug::WARNING) << "Database does not look like a DNA database anymore.\n";
-                    Debug(Debug::WARNING) << "We recompute as protein database.\n";
-                    dbType = Parameters::DBTYPE_AMINO_ACIDS;
-                    redoComp = true;
-                }
                 if(par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && e.multiline == true){
                     Debug(Debug::WARNING) << "Multiline fasta can not be combined with --createdb-mode 0.\n";
                     Debug(Debug::WARNING) << "We recompute with --createdb-mode 1.\n";
                     par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD;
-                    redoComp = true;
-                }
-                if (redoComp) {
                     progress.reset(SIZE_MAX);
                     hdrWriter.close();
                     seqWriter.close();
@@ -201,11 +163,10 @@ int createdb(int argc, const char **argv, const Command& command) {
             // Finally write down the entry
             unsigned int splitIdx = id % shuffleSplits;
             sourceLookup[splitIdx].emplace_back(fileIdx);
-            header.push_back('\n');
             if(par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT){
                 // +2 to emulate the \n\0
-                hdrWriter.writeIndexEntry(id, e.offset, header.size()+2, 0);
-                seqWriter.writeIndexEntry(id, e.offset + header.size(), e.sequence.l+2, 0);
+                hdrWriter.writeIndexEntry(id, e.headerOffset, (e.sequenceOffset-e.headerOffset)+1, 0);
+                seqWriter.writeIndexEntry(id, e.sequenceOffset, e.sequence.l+2, 0);
             }else{
                 hdrWriter.writeData(header.c_str(), header.length(), id, splitIdx);
                 seqWriter.writeStart(splitIdx);
@@ -216,7 +177,6 @@ int createdb(int argc, const char **argv, const Command& command) {
 
             entries_num++;
             numEntriesInCurrFile++;
-            count++;
             header.clear();
         }
         delete kseq;
@@ -225,6 +185,16 @@ int createdb(int argc, const char **argv, const Command& command) {
     fclose(source);
     hdrWriter.close(true);
     seqWriter.close(true);
+    if(dbType == -1) {
+        if (isNuclCnt == sampleCount) {
+            dbType = Parameters::DBTYPE_NUCLEOTIDES;
+        } else {
+            dbType = Parameters::DBTYPE_AMINO_ACIDS;
+        }
+        seqWriter.writeDbtypeFile(seqWriter.getDataFileName(), dbType ,par.compressed);
+    }
+    Debug(Debug::INFO) << "Database type: " << Parameters::getDbTypeName(dbType) << "\n";
+
 
 
     if(entries_num == 0){
@@ -239,13 +209,16 @@ int createdb(int argc, const char **argv, const Command& command) {
     }
 
     // fix ids
-    DBWriter::createRenumberedDB(dataFile, indexFile, "", DBReader<unsigned int>::LINEAR_ACCCESS);
-    DBWriter::createRenumberedDB(hdrDataFile, hdrIndexFile, "", DBReader<unsigned int>::LINEAR_ACCCESS);
+    DBWriter::createRenumberedDB(dataFile, indexFile, "", "", DBReader<unsigned int>::LINEAR_ACCCESS);
+    DBWriter::createRenumberedDB(hdrDataFile, hdrIndexFile, "", "", DBReader<unsigned int>::LINEAR_ACCCESS);
     if(par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT) {
-        for (size_t fileIdx = 0; fileIdx < filenames.size(); fileIdx++) {
-            if(par.sequenceSplitMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT){
-                FileUtil::symlinkAbs(filenames[0], dataFile+"."+SSTR(fileIdx));
-                FileUtil::symlinkAbs(filenames[0], hdrDataFile+"."+SSTR(fileIdx));
+        if(filenames.size() == 1){
+            FileUtil::symlinkAbs(filenames[0], dataFile);
+            FileUtil::symlinkAbs(filenames[0], hdrDataFile);
+        }else{
+            for (size_t fileIdx = 0; fileIdx < filenames.size(); fileIdx++) {
+                FileUtil::symlinkAbs(filenames[fileIdx], dataFile+"."+SSTR(fileIdx));
+                FileUtil::symlinkAbs(filenames[fileIdx], hdrDataFile+"."+SSTR(fileIdx));
             }
         }
     }
diff --git a/src/util/createseqfiledb.cpp b/src/util/createseqfiledb.cpp
index f0c90f6..d800f1a 100644
--- a/src/util/createseqfiledb.cpp
+++ b/src/util/createseqfiledb.cpp
@@ -1,5 +1,3 @@
-#include <sstream>
-
 #include "DBReader.h"
 #include "DBWriter.h"
 #include "Debug.h"
@@ -10,94 +8,105 @@
 #include <omp.h>
 #endif
 
-int createseqfiledb(int argc, const char **argv, const Command& command) {
-    Parameters& par = Parameters::getInstance();
+// Writes, for every entry of the result database (par.db2), one entry to the output
+// database (par.db3) that concatenates a ">" + header + sequence record for each member
+// key listed in that entry. Result entries whose line count (Util::countLines) is below
+// par.minSequences or above par.maxSequences are skipped. With par.hhFormat the first
+// member is additionally emitted as a "#" header line and a "_consensus" record.
+// Exits with EXIT_FAILURE if a member key is missing from the header or sequence database.
+int createseqfiledb(int argc, const char **argv, const Command &command) {
+    Parameters &par = Parameters::getInstance();
     par.parseParameters(argc, argv, command, true, 0, 0);
 
-    DBReader<unsigned int> clusters(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-    clusters.open(DBReader<unsigned int>::LINEAR_ACCCESS);
+    DBReader<unsigned int> headerDb(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+    headerDb.open(DBReader<unsigned int>::NOSORT);
+    if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
+        headerDb.readMmapedDataInMemory();
+    }
 
-    DBReader<unsigned int> bodies(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-    bodies.open(DBReader<unsigned int>::NOSORT);
-    bodies.readMmapedDataInMemory();
+    DBReader<unsigned int> seqDb(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+    seqDb.open(DBReader<unsigned int>::NOSORT);
+    if (par.preloadMode != Parameters::PRELOAD_MODE_MMAP) {
+        seqDb.readMmapedDataInMemory();
+    }
 
-    DBReader<unsigned int> headers(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-    headers.open(DBReader<unsigned int>::NOSORT);
-    headers.readMmapedDataInMemory();
+    DBReader<unsigned int> resultDb(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+    resultDb.open(DBReader<unsigned int>::LINEAR_ACCCESS);
 
-    DBWriter msaOut(par.db3.c_str(), par.db3Index.c_str(), static_cast<unsigned int>(par.threads), par.compressed, Parameters::DBTYPE_GENERIC_DB);
-    msaOut.open();
+    DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), static_cast<unsigned int>(par.threads), par.compressed, Parameters::DBTYPE_GENERIC_DB);
+    writer.open();
 
-    const size_t numClusters = clusters.getSize();
+    Debug::Progress progress(resultDb.getSize());
 #pragma omp parallel
     {
         unsigned int thread_idx = 0;
 #ifdef OPENMP
         thread_idx = static_cast<unsigned int>(omp_get_thread_num());
 #endif
+        std::string result;
+        result.reserve(1024);
+
+        char dbKey[255];
 #pragma omp for schedule(dynamic, 100)
-        for (size_t i = 0; i < numClusters; ++i){
-            std::string resultStr;
-            char* data = clusters.getData(i, thread_idx);
+        for (size_t i = 0; i < resultDb.getSize(); ++i) {
+            progress.updateProgress();
 
-            size_t entries = Util::countLines(data, clusters.getEntryLen(i) - 1);
+            unsigned int key = resultDb.getDbKey(i);
+            char *data = resultDb.getData(i, thread_idx);
+
+            size_t entries = Util::countLines(data, resultDb.getEntryLen(i) - 1);
             if (entries < (unsigned int) par.minSequences || entries > (unsigned int) par.maxSequences) {
                 continue;
             }
 
-            std::string entry;
-            std::istringstream clusterEntries(data);
             size_t entries_num = 0;
-            char dbKey[255 + 1];
-            while (std::getline(clusterEntries, entry)) {
-                resultStr.clear();
+            while (*data != '\0') {
                 entries_num++;
-                Util::parseKey((char*)entry.c_str(), dbKey);
-                const unsigned int entryId = (unsigned int) strtoul(dbKey, NULL, 10);
-
-                char* header = headers.getDataByDBKey(entryId, thread_idx);
-                if (header == NULL) {
-                    Debug(Debug::WARNING) << "Entry " << entry << " does not contain a header!" << "\n";
-                    continue;
+                Util::parseKey(data, dbKey);
+                data = Util::skipLine(data);
+
+                const unsigned int memberKey = (unsigned int) strtoul(dbKey, NULL, 10);
+                // headerDb and seqDb each have their own index, so resolve the member key in both
+                size_t headerId = headerDb.getId(memberKey);
+                if (headerId == UINT_MAX) {
+                    Debug(Debug::ERROR) << "Entry " << key << " does not contain a header!" << "\n";
+                    EXIT(EXIT_FAILURE);
                 }
-
-                char* body = bodies.getDataByDBKey(entryId, thread_idx);
-                if (body == NULL) {
-                    Debug(Debug::WARNING) << "Entry " << entry << " does not contain a sequence!" << "\n";
-                    continue;
+                size_t seqId = seqDb.getId(memberKey);
+                if (seqId == UINT_MAX) {
+                    Debug(Debug::ERROR) << "Entry " << key << " does not contain a sequence!" << "\n";
+                    EXIT(EXIT_FAILURE);
                 }
-                size_t lineLen = Util::skipLine(header) - header;
-                std::string headerStr(header, lineLen);
-                lineLen = Util::skipLine(body) - body;
-                std::string bodyStr(body, lineLen);
-
                 if (entries_num == 1 && par.hhFormat) {
-                    std::string consensusHeader(headerStr);
-                    resultStr.push_back('#');
-                    resultStr.append(headerStr);
-                    resultStr.push_back('>');
-                    resultStr.append(Util::removeAfterFirstSpace(consensusHeader));
-                    resultStr.append("_consensus\n");
-                    resultStr.append(bodyStr);
-                    resultStr.push_back('>');
-                    resultStr.append(headerStr);
-                    resultStr.append(bodyStr);
+                    char *header = headerDb.getData(headerId, thread_idx);
+                    size_t headerLen = headerDb.getEntryLen(headerId) - 1;
+                    size_t accessionLen = Util::skipNoneWhitespace(header);
+                    char *sequence = seqDb.getData(seqId, thread_idx);
+                    size_t sequenceLen = seqDb.getEntryLen(seqId) - 1;
+                    result.append(1, '#');
+                    result.append(header, headerLen);
+                    result.append(1, '>');
+                    result.append(header, accessionLen);
+                    result.append("_consensus\n");
+                    result.append(sequence, sequenceLen);
+                    result.append(1, '>');
+                    result.append(header, headerLen);
+                    result.append(sequence, sequenceLen);
                 } else {
-                    resultStr.push_back('>');
-                    resultStr.append(headerStr);
-                    resultStr.append(bodyStr);
+                    result.append(1, '>');
+                    result.append(headerDb.getData(headerId, thread_idx), headerDb.getEntryLen(headerId) - 1);
+                    result.append(seqDb.getData(seqId, thread_idx), seqDb.getEntryLen(seqId) - 1);
                 }
             }
-
-            unsigned int key = clusters.getDbKey(i);
-            msaOut.writeData(resultStr.c_str(), resultStr.length(), key, thread_idx);
+            writer.writeData(result.c_str(), result.length(), key, thread_idx);
+            result.clear();
         }
-    };
+    }
 
-    msaOut.close();
-    headers.close();
-    bodies.close();
-    clusters.close();
+    writer.close();
+    resultDb.close();
+    seqDb.close();
+    headerDb.close();
 
     return EXIT_SUCCESS;
 }
diff --git a/src/util/createsubdb.cpp b/src/util/createsubdb.cpp
index f7adccb..72eb401 100644
--- a/src/util/createsubdb.cpp
+++ b/src/util/createsubdb.cpp
@@ -4,7 +4,6 @@
 #include "DBWriter.h"
 #include "Debug.h"
 #include "Util.h"
-#include "MemoryMapped.h"
 
 #include <climits>
 
@@ -12,30 +11,30 @@ int createsubdb(int argc, const char **argv, const Command& command) {
     Parameters& par = Parameters::getInstance();
     par.parseParameters(argc, argv, command, true, 0, 0);
 
-    std::string file = par.db1Index;
-    if (FileUtil::fileExists(file.c_str()) == false) {
-        file = par.db1;
-        if (FileUtil::fileExists(file.c_str()) == false) {
-            Debug(Debug::ERROR) << "File " << file << " does not exist.\n";
+    FILE *orderFile = NULL;
+    if (FileUtil::fileExists(par.db1Index.c_str())) {
+        orderFile = fopen(par.db1Index.c_str(), "r");
+    } else {
+        if(FileUtil::fileExists(par.db1.c_str())){
+            orderFile = fopen(par.db1.c_str(), "r");
+        }else{
+            Debug(Debug::ERROR) << "File " << par.db1 << " does not exist.\n";
             EXIT(EXIT_FAILURE);
         }
     }
 
-    MemoryMapped order(file, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
-    char* data = (char *) order.getData();
-
     DBReader<unsigned int> reader(par.db2.c_str(), par.db2Index.c_str(), 1, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
     reader.open(DBReader<unsigned int>::NOSORT);
     const bool isCompressed = reader.isCompressed();
 
     DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, 0, Parameters::DBTYPE_OMIT_FILE);
     writer.open();
-
+    // getline() (re)allocates the line buffer as needed; the buffer must be released with free()
+    char *line = NULL;
+    size_t len = 0;
     char dbKey[256];
-    while (*data != '\0') {
-        Util::parseKey(data, dbKey);
-        data = Util::skipLine(data);
-
+    while (getline(&line, &len, orderFile) != -1) {
+        Util::parseKey(line, dbKey);
         const unsigned int key = Util::fast_atoi<unsigned int>(dbKey);
         const size_t id = reader.getId(key);
         if (id >= UINT_MAX) {
@@ -73,8 +72,9 @@ int createsubdb(int argc, const char **argv, const Command& command) {
     DBWriter::writeDbtypeFile(par.db3.c_str(), reader.getDbtype(), isCompressed);
     DBReader<unsigned int>::softlinkDb(par.db2, par.db3, DBFiles::SEQUENCE_ANCILLARY);
 
+    free(line);
     reader.close();
-    order.close();
+    fclose(orderFile);
 
     return EXIT_SUCCESS;
 }
diff --git a/src/util/diffseqdbs.cpp b/src/util/diffseqdbs.cpp
index a268bdb..603a38a 100644
--- a/src/util/diffseqdbs.cpp
+++ b/src/util/diffseqdbs.cpp
@@ -51,7 +51,7 @@ int diffseqdbs(int argc, const char **argv, const Command &command) {
 
     // Fill up the hash tables for the old and new DB
     size_t indexSizeOld = oldReader.getSize();
-    // keys pairs are like : (headerID,key) where key is the ffindex key corresponding to the header
+    // key pairs contain (headerID, key) where key is the DB key corresponding to the header
     std::pair<std::string, unsigned int> *keysOld
             = new std::pair<std::string, unsigned int>[indexSizeOld];
 #pragma omp parallel
diff --git a/src/util/expandaln.cpp b/src/util/expandaln.cpp
index 900bb3c..e84e97b 100644
--- a/src/util/expandaln.cpp
+++ b/src/util/expandaln.cpp
@@ -26,24 +26,24 @@ void rescoreResultByBacktrace(Matcher::result_t &result, Sequence &qSeq, Sequenc
     const bool isQueryProf = Parameters::isEqualDbtype(qSeq.getSeqType(), Parameters::DBTYPE_HMM_PROFILE);
     const bool isTargetProf = Parameters::isEqualDbtype(tSeq.getSeqType(), Parameters::DBTYPE_HMM_PROFILE);
 //    for(int i = result.qStartPos; i < result.qEndPos; i++){
-//        printf("%c",subMat.int2aa[qSeq.int_sequence[i]]);
+//        printf("%c",subMat.num2aa[qSeq.sequence[i]]);
 //    }
 //    Debug(Debug::INFO) << "\n";
 //    for(int i = result.dbStartPos; i < result.dbEndPos; i++){
-//        printf("%c",subMat.int2aa[tSeq.int_sequence[i]]);
+//        printf("%c",subMat.num2aa[tSeq.sequence[i]]);
 //    }
 //    Debug(Debug::INFO) << "\n";
     for (size_t i = 0; i < result.backtrace.size(); ++i) {
         char state = result.backtrace[i];
         if (state == 'M') {
             if (isTargetProf) {
-                score += tSeq.profile_for_alignment[qSeq.int_sequence[qPos] * tSeq.L + tPos]  + static_cast<short>((compositionBias[i] < 0.0)? compositionBias[i] - 0.5: compositionBias[i] + 0.5);;
+                score += tSeq.profile_for_alignment[qSeq.numSequence[qPos] * tSeq.L + tPos]  + static_cast<short>((compositionBias[i] < 0.0)? compositionBias[i] - 0.5: compositionBias[i] + 0.5);;
             } else if (isQueryProf) {
-                score += qSeq.profile_for_alignment[tSeq.int_sequence[tPos] * qSeq.L + qPos];
+                score += qSeq.profile_for_alignment[tSeq.numSequence[tPos] * qSeq.L + qPos];
             } else {
-                score += subMat.subMatrix[qSeq.int_sequence[qPos]][tSeq.int_sequence[tPos]] + static_cast<short>((compositionBias[i] < 0.0)? compositionBias[i] - 0.5: compositionBias[i] + 0.5);
+                score += subMat.subMatrix[qSeq.numSequence[qPos]][tSeq.numSequence[tPos]] + static_cast<short>((compositionBias[i] < 0.0)? compositionBias[i] - 0.5: compositionBias[i] + 0.5);
             }
-            identities += qSeq.int_sequence[qPos] == tSeq.int_sequence[tPos] ? 1 : 0;
+            identities += qSeq.numSequence[qPos] == tSeq.numSequence[tPos] ? 1 : 0;
             qPos++;
             tPos++;
         } else if (state == 'I') {
@@ -136,7 +136,7 @@ int expandaln(int argc, const char **argv, const Command& command) {
 
     BacktraceTranslator translator;
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, par.scoreBias);
-    EvalueComputation evaluer(targetReader.getAminoAcidDBSize(), &subMat, par.gapOpen, par.gapExtend);
+    EvalueComputation evaluer(targetReader.getAminoAcidDBSize(), &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
     Debug::Progress progress(resultReader->getSize());
 
     Debug(Debug::INFO) << "Computing expanded alignment result...\n";
@@ -171,7 +171,7 @@ int expandaln(int argc, const char **argv, const Command& command) {
                              queryReader.getSeqLen(querySeqId));
 
             if(par.compBiasCorrection == true && Parameters::isEqualDbtype(queryDbType,Parameters::DBTYPE_AMINO_ACIDS)){
-                SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, qSeq.int_sequence, qSeq.L, compositionBias);
+                SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, qSeq.numSequence, qSeq.L, compositionBias);
             }
 
             char *data = resultReader->getData(i, thread_idx);
@@ -200,7 +200,7 @@ int expandaln(int argc, const char **argv, const Command& command) {
                 }
                 for (size_t k = 0; k < expanded.size(); ++k) {
                     Matcher::result_t &resultBC = expanded[k];
-                    if (resultBC.backtrace.size() == 0) {
+                    if (resultBC.backtrace.empty()) {
                         Debug(Debug::ERROR) << "Alignment must contain a backtrace.\n";
                         EXIT(EXIT_FAILURE);
                     }
@@ -208,7 +208,7 @@ int expandaln(int argc, const char **argv, const Command& command) {
 //                    Debug(Debug::INFO) << buffer;
 
                     translator.translateResult(resultAB, resultBC, resultAC);
-                    if (resultAC.backtrace.size() == 0) {
+                    if (resultAC.backtrace.empty()) {
                         continue;
                     }
 
@@ -217,7 +217,7 @@ int expandaln(int argc, const char **argv, const Command& command) {
                     }
 
                     rescoreResultByBacktrace(resultAC, qSeq, tSeq, subMat, compositionBias,
-                                             evaluer, par.gapOpen, par.gapExtend, par.seqIdMode);
+                                             evaluer, par.gapOpen.aminoacids, par.gapExtend.aminoacids, par.seqIdMode);
 
                     if (Alignment::checkCriteria(resultAC, false, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) {
                         results.emplace_back(resultAC);
diff --git a/src/util/extractdomains.cpp b/src/util/extractdomains.cpp
index 0f978a7..62742f1 100644
--- a/src/util/extractdomains.cpp
+++ b/src/util/extractdomains.cpp
@@ -96,8 +96,8 @@ int scoreSubAlignment(std::string query, std::string target, unsigned int qStart
 //            std::cout << "tGap\t"  << query[qPos] << "\t" << target[tPos] << "\t" << rawScore << "\t" << rawScore << "\t" << maxScore << std::endl;
         } else {
 
-            int queryRes = qSeq.int_sequence[qPos];
-            int targetRes = tSeq.int_sequence[tPos];
+            int queryRes = qSeq.numSequence[qPos];
+            int targetRes = tSeq.numSequence[tPos];
             int matchScore = matrix.subMatrix[queryRes][targetRes];
             rawScore = std::max(0, rawScore + matchScore);
 //            std::cout << "Matc\t"  << queryAA << "\t" << targetAA << "\t" << matchScore << "\t" << rawScore << "\t" << maxScore << std::endl;
@@ -261,7 +261,7 @@ int doExtract(Parameters &par, DBReader<unsigned int> &blastTabReader,
             char *tabData = blastTabReader.getData(i, thread_idx);
             size_t tabLength = blastTabReader.getEntryLen(i) - 1;
             const std::vector<Domain> result = getEntries(std::string(tabData, tabLength));
-            if (result.size() == 0) {
+            if (result.empty()) {
                 Debug(Debug::WARNING) << "Can not map any entries for entry " << id << "!\n";
                 continue;
             }
diff --git a/src/util/extractframes.cpp b/src/util/extractframes.cpp
index f909d1c..94c4236 100644
--- a/src/util/extractframes.cpp
+++ b/src/util/extractframes.cpp
@@ -18,8 +18,6 @@
 
 int extractframes(int argc, const char **argv, const Command& command) {
     Parameters& par = Parameters::getInstance();
-    par.overrideParameterDescription((Command &)command, par.PARAM_ORF_FORWARD_FRAMES.uniqid, "comma-seperated list of frames on the forward strand to be extracted", NULL, par.PARAM_ORF_FORWARD_FRAMES.category);
-    par.overrideParameterDescription((Command &)command, par.PARAM_ORF_REVERSE_FRAMES.uniqid, "comma-seperated list of frames on the reverse strand to be extracted", NULL, par.PARAM_ORF_REVERSE_FRAMES.category);
     par.parseParameters(argc, argv, command, true, 0, 0);
 
     DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
@@ -47,9 +45,8 @@ int extractframes(int argc, const char **argv, const Command& command) {
         if (querySize == 0) {
             queryFrom = 0;
         }
-        char buffer[LINE_MAX];
-
 
+        char buffer[1024];
         std::string reverseComplementStr;
         reverseComplementStr.reserve(32000);
         for (unsigned int i = queryFrom; i < (queryFrom + querySize); ++i){
@@ -84,15 +81,13 @@ int extractframes(int argc, const char **argv, const Command& command) {
                 bool hasWrongChar = false;
                 for(size_t pos = 0; pos < sequenceLength; ++pos) {
                     char reverseComplement = Orf::complement(data[sequenceLength - pos - 1]);
-                    reverseComplementStr.push_back(Orf::complement(data[sequenceLength - pos - 1]));
-                    if(reverseComplement == '.') {
-                        Debug(Debug::WARNING) << "Can not compute reverse sequence of  sequence with index " << i << "!\n";
-                        hasWrongChar = true;
-                    }
-                }
-                if(hasWrongChar == true){
-                    continue;
+                    reverseComplement = (reverseComplement == '.') ? 'N' : reverseComplement;
+                    reverseComplementStr.push_back(reverseComplement);
+                    hasWrongChar |= (reverseComplement == '.');
                 }
+//                if(hasWrongChar == true){
+//                    continue;
+//                }
                 reverseComplementStr.push_back('\n');
             }
 
@@ -127,13 +122,12 @@ int extractframes(int argc, const char **argv, const Command& command) {
         {
 #pragma omp task
             {
-                DBWriter::createRenumberedDB(par.hdr2, par.hdr2Index, "");
+                DBWriter::createRenumberedDB(par.hdr2, par.hdr2Index, "", "");
             }
 
 #pragma omp task
             {
-                std::string lookup = par.db1 + ".lookup";
-                DBWriter::createRenumberedDB(par.db2, par.db2Index, par.createLookup ? lookup : "");
+                DBWriter::createRenumberedDB(par.db2, par.db2Index, par.createLookup ? par.db1 : "", par.createLookup ? par.db1Index : "");
             }
         }
     }
diff --git a/src/util/extractorfs.cpp b/src/util/extractorfs.cpp
index d2338d5..6cfadc9 100644
--- a/src/util/extractorfs.cpp
+++ b/src/util/extractorfs.cpp
@@ -60,6 +60,7 @@ int extractorfs(int argc, const char **argv, const Command& command) {
             queryFrom = 0;
         }
         char* aa = new char[par.maxSeqLen + 3 + 1];
+        char buffer[1024];
 
         std::vector<Orf::SequenceLocation> res;
         res.reserve(1000);
@@ -87,8 +88,6 @@ int extractorfs(int argc, const char **argv, const Command& command) {
                     continue;
                 }
 
-                char buffer[LINE_MAX];
-
                 std::pair<const char*, size_t> sequence = orf.getSequence(loc);
                 size_t fromPos = loc.from;
                 size_t toPos = loc.to;
@@ -139,13 +138,12 @@ int extractorfs(int argc, const char **argv, const Command& command) {
         {
 #pragma omp task
             {
-                DBWriter::createRenumberedDB(par.hdr2, par.hdr2Index, "");
+                DBWriter::createRenumberedDB(par.hdr2, par.hdr2Index, "", "");
             }
 
 #pragma omp task
             {
-                std::string lookup = par.db1 + ".lookup";
-                DBWriter::createRenumberedDB(par.db2, par.db2Index, par.createLookup ? lookup : "");
+                DBWriter::createRenumberedDB(par.db2, par.db2Index, par.createLookup ? par.db1 : "", par.createLookup ? par.db1Index : "");
             }
         }
     }
diff --git a/src/util/filterdb.cpp b/src/util/filterdb.cpp
index e20a480..3160e11 100644
--- a/src/util/filterdb.cpp
+++ b/src/util/filterdb.cpp
@@ -3,561 +3,517 @@
 #include "DBWriter.h"
 #include "Util.h"
 #include "Debug.h"
-#include "filterdb.h"
 #include "FileUtil.h"
+#include "ExpressionParser.h"
 
 #include <fstream>
-#include <iostream>
-#include <algorithm>
 
 #include <omptl/omptl_algorithm>
+#include <regex.h>
 
 #ifdef OPENMP
 #include <omp.h>
 #endif
 
+#define REGEX_FILTERING       0
+#define FILE_FILTERING        1
+#define FILE_MAPPING          2
+#define GET_FIRST_LINES       3
+#define NUMERIC_COMPARISON    4
+#define SORT_ENTRIES          5
+#define BEATS_FIRST           6
+#define JOIN_DB               7
+#define EXPRESSION_FILTERING 10
+
+
+enum ComparisonOperator {
+    OP_GEQ,
+    OP_LEQ,
+    OP_EQ,
+    OP_IN_P,
+    OP_OUT_P,
+    OP_EQ_P,
+
+    OP_INVALID
+};
+
+ComparisonOperator mapOperator(const std::string& op) {
+    if (op == "ge") return OP_GEQ;
+    if (op == "le") return OP_LEQ;
+    if (op == "e")  return OP_EQ;
+    if (op == "ip") return OP_IN_P;
+    if (op == "op") return OP_OUT_P;
+    if (op == "ep") return OP_EQ_P;
+    return OP_INVALID;
+}
+
+#define INCREASING 1
+#define DECREASING 2
+#define SHUFFLE    3
+
+struct compareString {
+    bool operator() (const std::string& lhs, const std::string& rhs) const{
+        return (lhs.compare(rhs) <= 0);
+    }
+};
+
+struct compareFirstString {
+    bool operator() (const std::pair<std::string, std::string>& lhs, const std::pair<std::string,std::string>& rhs) const{
+        return (lhs.first.compare(rhs.first) <= 0);
+    }
+};
+
+struct compareToFirstString {
+    bool operator() (const std::pair<std::string,std::string>& lhs,const std::string& rhs) const{
+        return (lhs.first.compare(rhs) < 0);
+    }
+};
 
 struct compareFirstEntry {
-    bool operator()(const std::pair <double,std::string> &lhs,
-                    const std::pair <double,std::string> &rhs) const {
+    bool operator()(const std::pair<double, std::string> &lhs, const std::pair<double, std::string> &rhs) const {
         return (lhs.first < rhs.first);
     }
 };
 
-
 struct compareFirstEntryDecreasing {
-    bool operator()(const std::pair <double,std::string> &lhs,
-                    const std::pair <double,std::string> &rhs) const {
+    bool operator()(const std::pair<double, std::string> &lhs, const std::pair<double, std::string> &rhs) const {
         return (lhs.first > rhs.first);
     }
 };
 
+int filterdb(int argc, const char **argv, const Command &command) {
+    Parameters &par = Parameters::getInstance();
+    par.parseParameters(argc, argv, command, true, 0, 0);
 
+    const size_t column = static_cast<size_t>(par.filterColumn);
+    const int columnToTake = par.columnToTake;
+    const bool trimToOneColumn = par.trimToOneColumn;
+    // positiveFilter = true => outDB = inDB \intersect filter ; othw : outDB = inDB - filter
+    const bool positiveFiltering = par.positiveFilter;
+    const bool shouldAddSelfMatch = par.includeIdentity;
+    const ComparisonOperator compOperator = mapOperator(par.compOperator);
 
-int ffindexFilter::initFiles() {
-	dataDb=new DBReader<unsigned int>(inDB.c_str(),(std::string(inDB).append(".index")).c_str(), threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-	dataDb->open(DBReader<unsigned int>::LINEAR_ACCCESS);
-
-	dbw = new DBWriter(outDB.c_str(), (std::string(outDB).append(".index")).c_str(), threads, compressed, dataDb->getDbtype());
-	dbw->open();
-	return 0;
-}
+    DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+    reader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
 
+    DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype());
+    writer.open();
 
+    // FILE_FILTERING
+    std::vector<std::string> filter;
 
+    // FILE_MAPPING
+    std::vector<std::pair<std::string, std::string>> mapping;
 
-ffindexFilter::ffindexFilter(Parameters &par) {
-    inDB = std::string(par.db1);
-    outDB = std::string(par.db2);
-    threads = par.threads;
-    compressed = par.compressed;
-    column  = static_cast<size_t>(par.filterColumn);
-    columnToTake = par.columnToTake;
-    trimToOneColumn = par.trimToOneColumn;
-    positiveFiltering = par.positiveFilter;
-    shouldAddSelfMatch = par.includeIdentity;
-    parser = NULL;
-    
-	initFiles();
+    // JOIN_DB
+    DBReader<unsigned int>* helper = NULL;
 
+    // REGEX_FILTERING
+    regex_t regex;
 
-    if (par.sortEntries) {
+    int mode;
+    if (par.sortEntries != 0) {
         mode = SORT_ENTRIES;
-        std::cout<<"Filtering by sorting entries."<<std::endl;
-        sortingMode = par.sortEntries;
-    } else if (par.filteringFile != "") {
+        Debug(Debug::INFO) << "Filtering by sorting entries\n";
+    } else if (par.filteringFile.empty() == false) {
         mode = FILE_FILTERING;
-        std::cout << "Filtering with filter files." << std::endl;
-        filterFile = par.filteringFile;
-        // Fill the filter with the keys contained in the file
+        Debug(Debug::INFO) << "Filtering using file(s)\n";
+        // Fill the filter with the data contained in the file
         std::vector<std::string> filenames;
-        if (FileUtil::fileExists(filterFile.c_str())) {
-            filenames.push_back(filterFile);
-        } else if (FileUtil::fileExists((filterFile + ".dbtype").c_str())) {
-            filenames = FileUtil::findDatafiles(filterFile.c_str());
+        if (FileUtil::fileExists(par.filteringFile.c_str())) {
+            filenames.push_back(par.filteringFile);
+        } else if (FileUtil::fileExists((par.filteringFile + ".dbtype").c_str())) {
+            filenames = FileUtil::findDatafiles(par.filteringFile.c_str());
         } else {
-            Debug(Debug::ERROR) << "File " << filterFile << " does not exist.\n";
+            Debug(Debug::ERROR) << "File " << par.filteringFile << " does not exist\n";
             EXIT(EXIT_FAILURE);
         }
-        char *line = NULL;
-        size_t len = 0;
-        char key[4096];
+        char key[65536];
         for (size_t i = 0; i < filenames.size(); i++) {
-            FILE *orderFile = fopen(filenames[i].c_str(), "r");
-            while (getline(&line, &len, orderFile) != -1) {
-                size_t offset = 0;
-                // ignore \0 in data files to support datafiles as input
-                while (offset < len && line[offset] == '\0') {
-                    offset++;
+            FILE * orderFile = fopen(filenames[i].c_str(), "r");
+            int c;
+            size_t offset = 0;
+            bool inKey = true;
+            // parse first column in each line without tripping over additional null bytes
+            // as we allow database data files as input
+            while ((c = fgetc(orderFile)) != EOF) {
+                if (c == '\n') {
+                    if (offset > 0) {
+                        key[offset] = '\0';
+                        offset = 0;
+                        filter.emplace_back(key);
+                    }
+                    inKey = true;
+                    continue;
+                }
+                if (c == ' ' || c == '\t') {
+                    inKey = false;
+                    continue;
                 }
-                if (offset >= len) {
-                    break;
+                if (c == '\0' || inKey == false) {
+                    continue;
                 }
-                Util::parseKey(line + offset, key);
+
+                key[offset] = c;
+                offset++;
+
+                if (offset == 65536) {
+                    Debug(Debug::ERROR) << "Input in file " << filenames[i] << " too long\n";
+                    EXIT(EXIT_FAILURE);
+                }
+            }
+            if (inKey == true && offset > 0) {
+                key[offset] = '\0';
                 filter.emplace_back(key);
             }
             fclose(orderFile);
         }
-        free(line);
         omptl::sort(filter.begin(), filter.end());
         std::vector<std::string>::iterator last = std::unique(filter.begin(), filter.end());
         filter.erase(last, filter.end());
-    } else if(par.mappingFile != "") {
+    } else if (par.mappingFile.empty() == false) {
         mode = FILE_MAPPING;
-        std::cout<<"Filtering by mapping values."<<std::endl;
-        filterFile = par.mappingFile;
-
-        // Fill the filter with the data contained in the file
-        std::ifstream filterFileStream;
-        filterFileStream.open(filterFile);
+        Debug(Debug::INFO) << "Filtering by mapping values\n";
+        std::ifstream ss(par.mappingFile);
         std::string line;
-        while (std::getline(filterFileStream,line))
-        {
-            std::string keyOld,keyNew;
+        std::string keyOld, keyNew;
+        while (std::getline(ss, line)) {
             std::istringstream lineToSplit(line);
-            std::getline(lineToSplit,keyOld,'\t');
-            std::getline(lineToSplit,keyNew,'\t');
-
-
-            mapping.push_back(std::make_pair(keyOld, keyNew));
+            std::getline(lineToSplit, keyOld, '\t');
+            std::getline(lineToSplit, keyNew, '\t');
+            mapping.emplace_back(keyOld, keyNew);
         }
         std::stable_sort(mapping.begin(), mapping.end(), compareFirstString());
-    } else if(par.extractLines > 0){ // GET_FIRST_LINES mode
+    } else if (par.extractLines > 0) {
         mode = GET_FIRST_LINES;
-        numberOfLines = par.extractLines;
-        std::cout << "Filtering by extracting the first " << numberOfLines << " lines.\n";
-    } else if(!par.joinDB.empty()){
+        Debug(Debug::INFO) << "Filtering by extracting the first " << par.extractLines << " lines\n";
+    } else if (par.joinDB.empty() == false) {
         mode = JOIN_DB;
-        std::string joinIndex(par.joinDB);
-        joinIndex.append(".index");
-        joinDB = new DBReader<unsigned int>(par.joinDB.c_str(), joinIndex.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-        joinDB->open(DBReader<unsigned int>::NOSORT);
-        std::cout << "Joining targets to query database.\n";
-    } else if (!par.compPos.empty()) {
-        mode = COMPUTE_POSITIONS;
-        std::string swapIndex = par.compPos + ".index";
-        swapDB = new DBReader<unsigned int>(par.compPos.c_str(), swapIndex.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-        swapDB->open(DBReader<unsigned int>::NOSORT);
-        std::string A = par.db3;
-        std::cout << "Swapping fields\n";
-    } else if (!par.clusterFile.empty()){
-        mode = TRANSITIVE_REPLACE;
-        std::string clusterIndex = par.clusterFile + ".index";
-        clusterDB = new DBReader<unsigned int>(par.clusterFile.c_str(), clusterIndex.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-        clusterDB->open(DBReader<unsigned int>::NOSORT);
-        std::cout << "Replacing target Field by clusters Genes\n";
-    } else if (par.beatsFirst){
+        std::string joinIndex = par.joinDB + ".index";
+        helper = new DBReader<unsigned int>(par.joinDB.c_str(), joinIndex.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+        helper->open(DBReader<unsigned int>::NOSORT);
+        Debug(Debug::INFO) << "Joining databases by column value\n";
+    } else if (par.beatsFirst == true) {
         mode = BEATS_FIRST;
-        std::cout << "Filter by numerical comparison to first row.\n";
-        compOperator = par.compOperator;
-    } else if(par.compOperator != "") {
+        Debug(Debug::INFO) << "Filtering by numerical comparison to first row\n";
+    } else if (par.compOperator.empty() == false) {
         mode = NUMERIC_COMPARISON;
-        std::cout << "Filtering by numerical comparison.\n";
-        compValue = par.compValue;
-        compOperator = par.compOperator;
-    } else if (par.filterExpression != "") {
+        Debug(Debug::INFO) << "Filtering by numerical comparison\n";
+    } else if (par.filterExpression.empty() == false) {
         mode = EXPRESSION_FILTERING;
-        parser = new ExpressionParser(par.filterExpression.c_str());
-        if (parser->isOk() == false){
-            Debug(Debug::INFO) << "Error in expression " << par.filterExpression << "\n";
-            EXIT(EXIT_FAILURE);
-        }
-        bindableParserColumns = parser->findBindableIndices();
     } else {
         mode = REGEX_FILTERING;
-        std::cout << "Filtering by RegEx.\n";
-        regexStr = par.filterColumnRegex;
-        int status = regcomp(&regex, regexStr.c_str(), REG_EXTENDED | REG_NEWLINE);
-        if (status != 0 ){
-            Debug(Debug::ERROR) << "Error in regex " << regexStr << "\n";
+        Debug(Debug::INFO) << "Filtering using regular expression\n";
+        int status = regcomp(&regex, par.filterColumnRegex.c_str(), REG_EXTENDED | REG_NEWLINE);
+        if (status != 0) {
+            Debug(Debug::ERROR) << "Error in regex " << par.filterColumnRegex << "\n";
             EXIT(EXIT_FAILURE);
         }
     }
-}
-    
-        
-ffindexFilter::~ffindexFilter() {
-	if (mode == REGEX_FILTERING)
-		regfree(&regex);
-	dataDb->close();
-	dbw->close();
-	delete dataDb;
-	delete dbw;
-}
-
-
-
-int ffindexFilter::runFilter(){
-	const size_t LINE_BUFFER_SIZE = 1000000;
-    Debug::Progress progress(dataDb->getSize());
 
+    const size_t LINE_BUFFER_SIZE = 1000000;
+    Debug::Progress progress(reader.getSize());
 #pragma omp parallel
-	{
+    {
         int thread_idx = 0;
 #ifdef OPENMP
         thread_idx = omp_get_thread_num();
 #endif
 
-		char *lineBuffer = new char[LINE_BUFFER_SIZE];
-		char *columnValue = new char[LINE_BUFFER_SIZE];
-		const char **columnPointer = new const char*[column + 1];
+        char *lineBuffer = new char[LINE_BUFFER_SIZE];
+        char *columnValue = new char[LINE_BUFFER_SIZE];
+        const char **columnPointer = new const char *[column + 1];
+
+        char *newLineBuffer = new char[LINE_BUFFER_SIZE];
+
+        double referenceValue = 0;
+
+        std::string buffer = "";
+        buffer.reserve(LINE_BUFFER_SIZE);
+
+        std::vector<std::pair<double, std::string>> toSort;
+
+        char dbKeyBuffer[255 + 1];
 
-		double threadCompValue = compValue;
+        // EXPRESSION_FILTERING
+        ExpressionParser* parser = NULL;
+        std::vector<int> bindableParserColumns;
 
-		std::string buffer = "";
-		buffer.reserve(LINE_BUFFER_SIZE);
+        if (mode == EXPRESSION_FILTERING) {
+            parser = new ExpressionParser(par.filterExpression.c_str());
+            if (parser->isOk() == false) {
+                Debug(Debug::INFO) << "Error in expression " << par.filterExpression << "\n";
+                EXIT(EXIT_FAILURE);
+            }
+            bindableParserColumns = parser->findBindableIndices();
+        }
 
 #pragma omp for schedule(dynamic, 10)
-		for (size_t id = 0; id < dataDb->getSize(); id++) {
-			progress.updateProgress();
-
-			char *data = dataDb->getData(id,  thread_idx);
-            unsigned int queryKey = dataDb->getDbKey(id);
-			size_t dataLength = dataDb->getEntryLen(id);
-			int counter = 0;
-            
-            std::vector<std::pair<double, std::string>> toSort;
+        for (size_t id = 0; id < reader.getSize(); ++id) {
+            progress.updateProgress();
+
+            char *data = reader.getData(id, thread_idx);
+            unsigned int queryKey = reader.getDbKey(id);
+            size_t dataLength = reader.getEntryLen(id);
+            int counter = 0;
+
             bool addSelfMatch = false;
 
-			while (*data != '\0') {
+            while (*data != '\0') {
                 if (shouldAddSelfMatch) {
-                    char dbKeyBuffer[255 + 1];
                     Util::parseKey(data, dbKeyBuffer);
                     const unsigned int curKey = (unsigned int) strtoul(dbKeyBuffer, NULL, 10);
                     addSelfMatch = (queryKey == curKey);
                 }
-                    
-				if(!Util::getLine(data, dataLength, lineBuffer, LINE_BUFFER_SIZE)) {
-					Debug(Debug::WARNING) << "Identifier was too long and was cut off!\n";
-					data = Util::skipLine(data);
-					continue;
-				}
+
+                if (!Util::getLine(data, dataLength, lineBuffer, LINE_BUFFER_SIZE)) {
+                    Debug(Debug::WARNING) << "Identifier was too long and was cut off!\n";
+                    data = Util::skipLine(data);
+                    continue;
+                }
 
                 counter++;
                 size_t foundElements = 1;
-                if (mode != GET_FIRST_LINES) {
+                if (mode != GET_FIRST_LINES || trimToOneColumn) {
                     foundElements = Util::getWordsOfLine(lineBuffer, columnPointer, column + 1);
-                    if(foundElements < column  ){
+                    if (foundElements < column) {
                         Debug(Debug::ERROR) << "Column=" << column << " does not exist in line " << lineBuffer << "\n";
                         EXIT(EXIT_FAILURE);
                     }
 
                     size_t colStrLen;
                     // if column is last column
-                    if(column == foundElements){
-                        const size_t entrySize = Util::skipNoneWhitespace(columnPointer[(column - 1)]); //Util::skipLine(data)
+                    if (column == foundElements) {
+                        const size_t entrySize = Util::skipNoneWhitespace(columnPointer[(column - 1)]);
                         memcpy(columnValue, columnPointer[column - 1], entrySize);
                         columnValue[entrySize] = '\0';
                         colStrLen = entrySize;
-                    }else{
+                    } else {
                         const ptrdiff_t entrySize = columnPointer[column] - columnPointer[(column - 1)];
                         memcpy(columnValue, columnPointer[column - 1], entrySize);
                         columnValue[entrySize] = '\0';
                         colStrLen = entrySize;
                     }
 
-                    columnValue[Util::getLastNonWhitespace(columnValue,colStrLen)] = '\0'; // remove the whitespaces at the end
+                    // remove the whitespaces at the end
+                    columnValue[Util::getLastNonWhitespace(columnValue, colStrLen)] = '\0';
                 }
 
-				int nomatch = 0;
-				if(mode == GET_FIRST_LINES){
-					nomatch = 0; // output the line
-					if(counter > numberOfLines){
-						nomatch = 1; // hide the line in the output
-					}
-				} else if (mode == NUMERIC_COMPARISON) {
+                int nomatch = 0;
+                if (mode == GET_FIRST_LINES) {
+                    // output the line
+                    nomatch = 0;
+                    if (counter > par.extractLines) {
+                        // hide the line in the output
+                        nomatch = 1;
+                    }
+                } else if (mode == NUMERIC_COMPARISON) {
                     double toCompare = strtod(columnValue, NULL);
-                    if (compOperator == GREATER_OR_EQUAL) {
-                        nomatch = !(toCompare >= threadCompValue); // keep if the comparison is true
-                    } else if (compOperator == LOWER_OR_EQUAL) {
-                        nomatch = !(toCompare <= threadCompValue); // keep if the comparison is true
-                    } else if (compOperator == EQUAL) {
-                        nomatch = !(toCompare == threadCompValue); // keep if the comparison is true
+                    if (compOperator == OP_GEQ) {
+                        nomatch = !(toCompare >= par.compValue);
+                    } else if (compOperator == OP_LEQ) {
+                        nomatch = !(toCompare <= par.compValue);
+                    } else if (compOperator == OP_EQ) {
+                        nomatch = !(toCompare == par.compValue);
                     } else {
                         nomatch = 0;
                     }
                 } else if (mode == EXPRESSION_FILTERING) {
-				    const char* columnPointers[128];
+                    const char *columnPointers[128];
                     Util::getWordsOfLine(lineBuffer, columnPointers, 128);
-				    for (size_t i = 0; i < bindableParserColumns.size(); ++i) {
-				        size_t columnToBind = bindableParserColumns[i];
-				        char* rest;
-				        const double value = strtod(columnPointers[columnToBind], &rest);
+                    for (size_t i = 0; i < bindableParserColumns.size(); ++i) {
+                        size_t columnToBind = bindableParserColumns[i];
+                        char *rest;
+                        const double value = strtod(columnPointers[columnToBind], &rest);
                         if ((rest == columnPointers[columnToBind]) || errno == ERANGE) {
                             Debug(Debug::WARNING) << "Can not parse column " << columnToBind << "!\n";
                             continue;
                         }
-				        parser->bind(columnToBind, value);
-				    }
-				    const double result = parser->evaluate();
-                    nomatch = (result == 0);
-                } else if (mode == REGEX_FILTERING){
-					nomatch = regexec(&regex, columnValue, 0, NULL, 0);
-                } else if (mode == JOIN_DB){
-                    size_t newId = joinDB->getId(static_cast<unsigned int>(strtoul(columnValue, NULL, 10)));
-                    size_t originalLength = strlen(lineBuffer);
-                    // add tab
-                    lineBuffer[originalLength] = '\t';
-                    originalLength++;
-                    char* fullLine = joinDB->getData(newId, thread_idx);
-                    // either append the full line (default mode):
-                    if (columnToTake == -1) {
-                        size_t fullLineLength = joinDB->getEntryLen(newId);
-                        // Appending join database entry to query database entry
-                        memcpy(lineBuffer + originalLength, fullLine, fullLineLength);
+                        parser->bind(columnToBind, value);
                     }
-                    // or a specified column:
-                    else {
-                        if(*fullLine != '\0'){
-                            std::vector<std::string> splittedLine = Util::split(fullLine, "\t") ;
-                            char* newValue = const_cast<char *>(splittedLine[columnToTake].c_str());
-                            size_t valueLength = joinDB->getEntryLen(newId);
+                    const double result = parser->evaluate();
+                    nomatch = (result == 0);
+                } else if (mode == REGEX_FILTERING) {
+                    nomatch = regexec(&regex, columnValue, 0, NULL, 0);
+                } else if (mode == JOIN_DB) {
+                    size_t newId = helper->getId(static_cast<unsigned int>(strtoul(columnValue, NULL, 10)));
+                    if (newId != UINT_MAX) {
+                        size_t originalLength = strlen(lineBuffer);
+                        // Replace the last \n
+                        lineBuffer[originalLength - 1] = '\t';
+                        char *fullLine = helper->getData(newId, thread_idx);
+                        if (columnToTake == -1) {
+                            // either append the full line (default mode)
+                            size_t fullLineLength = helper->getEntryLen(newId);
+                            // Appending join database entry to query database entry
+                            memcpy(lineBuffer + originalLength, fullLine, fullLineLength);
+                        } else if (*fullLine != '\0') {
+                            // or a specified column
+                            std::vector<std::string> splittedLine = Util::split(fullLine, "\t");
+                            char *newValue = const_cast<char *>(splittedLine[columnToTake].c_str());
+                            size_t valueLength = helper->getEntryLen(newId);
                             // Appending join database entry to query database entry
                             memcpy(lineBuffer + originalLength, newValue, valueLength);
                         }
-                    }
-                }
-                else if (mode == TRANSITIVE_REPLACE) {
-                    std::string singleGene;
-
-                    char *newLineBuffer = new char[LINE_BUFFER_SIZE];
-                    size_t newLineBufferIndex = 0;
-                    char *endLine = lineBuffer + dataLength;
-                    *newLineBuffer = '\0';
-
-                    for (size_t i = 0;i<dataLength;i++)
-                        if (lineBuffer[i] == '\n' || lineBuffer[i] == '\0')
-                        {
-                            endLine = lineBuffer+i;
-                            break;
-                        }
-                    size_t fieldLength = Util::skipNoneWhitespace(columnPointer[column-1]);
-
-                    char *clusterGenes = clusterDB->getDataByDBKey(Util::fast_atoi<unsigned int>(columnValue), thread_idx);
-                    std::stringstream stringstreamClusterGenes(clusterGenes);
-
-
-                    while (std::getline(stringstreamClusterGenes, singleGene)) {
-
-                        if (!singleGene.empty()) {
-
-                            // copy the previous columns
-                            memcpy(newLineBuffer + newLineBufferIndex,lineBuffer,columnPointer[column-1] - columnPointer[0]);
-                            newLineBufferIndex += columnPointer[column-1] - columnPointer[0];
-
-                            // map the current column value
-                            memcpy(newLineBuffer + newLineBufferIndex,singleGene.c_str(),singleGene.length());
-                            newLineBufferIndex += singleGene.length();
-
-
-                            // copy the next columns
-                            if (foundElements > column)
-                            {
-                                memcpy(newLineBuffer + newLineBufferIndex,columnPointer[column-1]+fieldLength,endLine - (columnPointer[column-1]+fieldLength));
-                                newLineBufferIndex += endLine - (columnPointer[column-1]+fieldLength);
-                            } else {
-                                newLineBuffer[newLineBufferIndex++] = '\n';
-                            }
-
-                            if( newLineBuffer[newLineBufferIndex-1] != '\n') {
-                                newLineBuffer[newLineBufferIndex++] = '\n';
-                            }
-
-                            newLineBuffer[newLineBufferIndex] = '\0';
-
-                        }
-
-                    }
-
-                    if(!nomatch)
-                        memcpy(lineBuffer,newLineBuffer,newLineBufferIndex+1);
-                    delete [] newLineBuffer;
-
-                }
-                else if (mode == COMPUTE_POSITIONS) {
-				    // Optimise it
-                    std::vector<std::string> splittedOriginalLine = Util::split(lineBuffer, "\t");
-                    char *lineWithNewFields = swapDB->getDataByDBKey(
-                            static_cast<unsigned int>(strtoul(columnValue, NULL, 10)), thread_idx) ;
-                    std::vector<std::string> splittedLineWithNewFields = Util::split(lineWithNewFields, "\t");
-
-                    unsigned long posStart = std::stoul(splittedOriginalLine[7].c_str()) * 3;
-                    unsigned long posStop = std::stoul(splittedOriginalLine[8].c_str()) * 3;
-
-                    if (posStart < posStop) {
-                        unsigned long startPosHitOnGenome =
-                                std::stoul(splittedLineWithNewFields[7].c_str(), NULL) + posStart;
-                        unsigned long endPosHitOnGenome =
-                                std::stoul(splittedLineWithNewFields[7].c_str(), NULL) + posStop;
-                        splittedOriginalLine.insert(splittedOriginalLine.begin() + 8,
-                                                    std::to_string(startPosHitOnGenome) + "\t");
-                        splittedOriginalLine.insert(splittedOriginalLine.begin() + 10,
-                                                    std::to_string(endPosHitOnGenome) + "\t");
+                        nomatch = 0;
                     } else {
-                        unsigned long startPosHitOnGenome =
-                                std::stoul(splittedLineWithNewFields[7].c_str(), NULL) - posStart;
-                        unsigned long endPosHitOnGenome = std::stoul(splittedLineWithNewFields[7].c_str(), NULL) - posStop;
-                        splittedOriginalLine.insert(splittedOriginalLine.begin() + 8, std::to_string(startPosHitOnGenome) + "\t");
-                        splittedOriginalLine.insert(splittedOriginalLine.begin() + 10, std::to_string(endPosHitOnGenome) + "\t");
-                    }
-                    std::string tempBuffer = "" ;
-                    for (std::vector<std::string>::const_iterator i = splittedOriginalLine.begin(); i != splittedOriginalLine.end(); ++i) {
-                        tempBuffer.append(*i);
-                        tempBuffer.append("\t");
+                        nomatch = 1;
                     }
-                    tempBuffer.append("\n") ;
-                    strcpy(lineBuffer, tempBuffer.c_str()) ;
-                }
-                else if (mode == BEATS_FIRST){
+                } else if (mode == BEATS_FIRST) {
                     if (counter == 1) {
-                        threadCompValue = strtod(columnValue, NULL);
+                        referenceValue = strtod(columnValue, NULL);
                     } else {
                         double toCompare = strtod(columnValue, NULL);
-                        if (compOperator == GREATER_OR_EQUAL) {
-                            nomatch = !(toCompare >= threadCompValue); // keep if the comparison is true
-                        } else if(compOperator == LOWER_OR_EQUAL) {
-                            nomatch = !(toCompare <= threadCompValue); // keep if the comparison is true
-                        } else if(compOperator == EQUAL) {
-                            nomatch = !(toCompare == threadCompValue); // keep if the comparison is true
+                        if (compOperator == OP_GEQ) {
+                            nomatch = !(toCompare >= referenceValue);
+                        } else if (compOperator == OP_LEQ) {
+                            nomatch = !(toCompare <= referenceValue);
+                        } else if (compOperator == OP_EQ) {
+                            nomatch = !(toCompare == referenceValue);
+                        } else if (compOperator == OP_IN_P) {
+                            nomatch = !(toCompare >= (referenceValue * par.compValue));
+                        } else if (compOperator == OP_OUT_P) {
+                            nomatch = !(toCompare <= (referenceValue * par.compValue));
+                        } else if (compOperator == OP_EQ_P) {
+                            nomatch = !(toCompare == referenceValue * par.compValue);
                         } else {
                             nomatch = 0;
                         }
                     }
-                } else {
-                    // i.e. (mode == FILE_FILTERING || mode == FILE_MAPPING)
+                } else if (mode == FILE_FILTERING) {
                     std::string toSearch(columnValue);
-                    if (mode == FILE_FILTERING) {
-                        std::vector<std::string>::iterator foundInFilter = std::upper_bound(filter.begin(),
-                                                                                            filter.end(), toSearch,
-                                                                                            compareString());
-                        if (foundInFilter != filter.end() && toSearch.compare(*foundInFilter) == 0) {
-                            // Found in filter
-                            if (positiveFiltering)
-                                nomatch = 0;
-                            else
-                                nomatch = 1;
+                    std::vector<std::string>::iterator it = std::upper_bound(filter.begin(), filter.end(), toSearch, compareString());
+                    if (it != filter.end() && toSearch.compare(*it) == 0) {
+                        // Found in filter
+                        if (positiveFiltering) {
+                            nomatch = 0;
+                        } else {
+                            nomatch = 1;
+                        }
+                    } else {
+                        // not found in the filter
+                        if (positiveFiltering) {
+                            nomatch = 1;
                         } else {
-                            // not found in the filter
-                            if (positiveFiltering)
-                                nomatch = 1;
-                            else
-                                nomatch = 0;
+                            nomatch = 0;
                         }
-                    } else if (mode == FILE_MAPPING) {
-                        std::vector<std::pair<std::string, std::string>>::iterator foundInFilter = std::lower_bound(
-                                mapping.begin(), mapping.end(), toSearch, compareToFirstString());
+                    }
+                } else if (mode == FILE_MAPPING) {
+                    std::string toSearch(columnValue);
+                    std::vector<std::pair<std::string, std::string>>::iterator it
+                        = std::lower_bound(mapping.begin(), mapping.end(), toSearch, compareToFirstString());
 
-                        // by default, do NOT add to the output
-                        nomatch = 1;
+                    // by default, do NOT add to the output
+                    nomatch = 1;
 
-                        char *newLineBuffer = new char[LINE_BUFFER_SIZE];
-                        size_t newLineBufferIndex = 0;
-                        char *endLine = lineBuffer + dataLength;
-                        *newLineBuffer = '\0';
+                    size_t newLineBufferIndex = 0;
+                    char *endLine = lineBuffer + dataLength;
+                    *newLineBuffer = '\0';
 
-                        for (size_t i = 0; i < dataLength; i++) {
-                            if (lineBuffer[i] == '\n' || lineBuffer[i] == '\0') {
-                                endLine = lineBuffer + i;
-                                break;
-                            }
+                    for (size_t i = 0; i < dataLength; i++) {
+                        if (lineBuffer[i] == '\n' || lineBuffer[i] == '\0') {
+                            endLine = lineBuffer + i;
+                            break;
                         }
-                        size_t fieldLength = Util::skipNoneWhitespace(columnPointer[column - 1]);
+                    }
+                    size_t fieldLength = Util::skipNoneWhitespace(columnPointer[column - 1]);
 
-                        // Output all the possible mapping value
-                        while (foundInFilter != mapping.end() && toSearch.compare(foundInFilter->first) == 0) {
-                            nomatch = 0;
+                    // Output all the possible mapping value
+                    while (it != mapping.end() && toSearch.compare(it->first) == 0) {
+                        nomatch = 0;
 
-                            // copy the previous columns
-                            memcpy(newLineBuffer + newLineBufferIndex, lineBuffer,
-                                   columnPointer[column - 1] - columnPointer[0]);
-                            newLineBufferIndex += columnPointer[column - 1] - columnPointer[0];
-
-                            // map the current column value
-                            memcpy(newLineBuffer + newLineBufferIndex, (foundInFilter->second).c_str(),
-                                   (foundInFilter->second).length());
-                            newLineBufferIndex += (foundInFilter->second).length();
-
-                            // copy the next columns
-                            if (foundElements > column) {
-                                memcpy(newLineBuffer + newLineBufferIndex, columnPointer[column - 1] + fieldLength,
-                                       endLine - (columnPointer[column - 1] + fieldLength));
-                                newLineBufferIndex += endLine - (columnPointer[column - 1] + fieldLength);
-                            } else {
-                                newLineBuffer[newLineBufferIndex++] = '\n';
-                            }
-                            newLineBuffer[newLineBufferIndex] = '\0';
-
-                            foundInFilter++;
-                        }
-                        if (nomatch == 0) {
-                            memcpy(lineBuffer, newLineBuffer, newLineBufferIndex + 1);
+                        // copy the previous columns
+                        memcpy(newLineBuffer + newLineBufferIndex, lineBuffer, columnPointer[column - 1] - columnPointer[0]);
+                        newLineBufferIndex += columnPointer[column - 1] - columnPointer[0];
+
+                        // map the current column value
+                        memcpy(newLineBuffer + newLineBufferIndex, (it->second).c_str(), (it->second).length());
+                        newLineBufferIndex += (it->second).length();
+
+                        // copy the next columns
+                        if (foundElements > column) {
+                            memcpy(newLineBuffer + newLineBufferIndex, columnPointer[column - 1] + fieldLength,
+                                   endLine - (columnPointer[column - 1] + fieldLength));
+                            newLineBufferIndex += endLine - (columnPointer[column - 1] + fieldLength);
+                        } else {
+                            newLineBuffer[newLineBufferIndex++] = '\n';
                         }
-                        delete[] newLineBuffer;
-                    } else if (mode == SORT_ENTRIES) {
-                        toSort.push_back(
-                                std::make_pair<double, std::string>(std::strtod(columnValue, NULL), lineBuffer));
-                        nomatch = 1; // do not put anything in the output buffer
+                        newLineBuffer[newLineBufferIndex] = '\0';
+
+                        ++it;
                     }
-                    else // Unknown filtering mode, keep all entries
-						nomatch = 0;
+                    if (nomatch == 0) {
+                        memcpy(lineBuffer, newLineBuffer, newLineBufferIndex + 1);
+                    }
+                } else if (mode == SORT_ENTRIES) {
+                    toSort.emplace_back(std::strtod(columnValue, NULL), lineBuffer);
+                    // do not put anything in the output buffer
+                    nomatch = 1;
+                } else {
+                    // Unknown filtering mode, keep all entries
+                    nomatch = 0;
+                }
 
-				}
-                
                 if (addSelfMatch) {
                     nomatch = 0;
-                }  
-                    
-				if(!(nomatch)) {
+                }
+
+                if (nomatch == false) {
                     if (trimToOneColumn) {
                         buffer.append(columnValue);
+                    } else {
+                        buffer.append(lineBuffer);
                     }
-                    else {
-						buffer.append(lineBuffer);
-                    }
-                    
+
                     if (buffer.back() != '\n') {
-                        buffer.append("\n");
+                        buffer.append(1, '\n');
                     }
-				}
-				data = Util::skipLine(data);
-			}
-
-            if(mode == SORT_ENTRIES)
-            {
-                if (sortingMode ==INCREASING)
-                    std::stable_sort(toSort.begin(),toSort.end(),compareFirstEntry());
-                else if (sortingMode == DECREASING)
-                    std::stable_sort(toSort.begin(),toSort.end(),compareFirstEntryDecreasing());
-                else if (sortingMode == SHUFFLE)
-                {
-                    srand ( unsigned ( time(0) ) );
-                    std::random_shuffle(toSort.begin(),toSort.end());
+                }
+                data = Util::skipLine(data);
+            }
+
+            if (mode == SORT_ENTRIES) {
+                if (par.sortEntries == INCREASING) {
+                    std::stable_sort(toSort.begin(), toSort.end(), compareFirstEntry());
+                } else if (par.sortEntries == DECREASING) {
+                    std::stable_sort(toSort.begin(), toSort.end(), compareFirstEntryDecreasing());
+                } else if (par.sortEntries == SHUFFLE) {
+                    srand(unsigned(time(0)));
+                    std::random_shuffle(toSort.begin(), toSort.end());
                 }
 
-                
-                for (size_t i = 0; i< toSort.size(); i++)
-                {
-                    buffer.append(toSort[i].second);    
-                    if (buffer.back() != '\n')
-                      buffer.append("\n");
+                for (size_t i = 0; i < toSort.size(); i++) {
+                    buffer.append(toSort[i].second);
+                    if (buffer.back() != '\n') {
+                        buffer.append(1, '\n');
+                    }
                 }
-                
+                toSort.clear();
             }
-            dbw->writeData(buffer.c_str(), buffer.length(), dataDb->getDbKey(id), thread_idx);
-			buffer.clear();
-		}
-		delete [] lineBuffer;
-		delete [] columnValue;
-		delete [] columnPointer;
-	}
-
-	return 0;
-}
+            writer.writeData(buffer.c_str(), buffer.length(), queryKey, thread_idx);
+            buffer.clear();
+        }
 
-int filterdb(int argc, const char **argv, const Command& command) {
-	Parameters& par = Parameters::getInstance();
-    par.parseParameters(argc, argv, command, true, 0, 0);
+        if (parser != NULL) {
+            delete parser;
+        }
+
+        delete[] lineBuffer;
+        delete[] columnValue;
+        delete[] columnPointer;
+        delete[] newLineBuffer;
+    }
+    writer.close();
+    reader.close();
+
+    if (helper != NULL) {
+        helper->close();
+        delete helper;
+    }
+
+    if (mode == REGEX_FILTERING) {
+        regfree(&regex);
+    }
 
-    ffindexFilter filter(par);
-    return filter.runFilter();
+    return EXIT_SUCCESS;
 }
diff --git a/src/util/filterdb.h b/src/util/filterdb.h
deleted file mode 100644
index 460e6ba..0000000
--- a/src/util/filterdb.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#ifndef FILTERDB_H
-#define FILTERDB_H
-
-// Written by Martin Steinegger & Clovis Galiez
-//
-// Filter a ffindex based on a RegEx or a filtering file.
-//
-
-#include <cstddef>
-#include <utility>
-#include <string>
-#include <vector>
-#include <regex.h>
-#include "ExpressionParser.h"
-
-#define REGEX_FILTERING 0
-#define FILE_FILTERING 1
-#define FILE_MAPPING 2
-#define GET_FIRST_LINES 3
-#define NUMERIC_COMPARISON 4
-#define SORT_ENTRIES 5
-#define BEATS_FIRST 6
-#define JOIN_DB 7
-#define COMPUTE_POSITIONS 8
-#define TRANSITIVE_REPLACE 9
-#define EXPRESSION_FILTERING 10
-
-#define GREATER_OR_EQUAL "ge"
-#define LOWER_OR_EQUAL "le"
-#define EQUAL "e"
-
-#define INCREASING  1
-#define DECREASING  2
-#define SHUFFLE     3
-
-class ffindexFilter {
-public:
-
-
-    ffindexFilter(Parameters &par); 
-	~ffindexFilter();
-	
-	int runFilter();
-
-	struct compareString {
-		bool operator() (const std::string& lhs, const std::string& rhs) const{
-			return (lhs.compare(rhs)<=0);
-		}
-	};
-
-	struct compareFirstString {
-		bool operator() (const std::pair<std::string, std::string>& lhs, const std::pair<std::string,std::string>& rhs) const{
-			return (lhs.first.compare(rhs.first)<=0);
-		}
-	};
-
-    struct compareFirstInt {
-        bool operator() (const std::pair<unsigned int, unsigned int>& lhs, const std::pair<unsigned int, unsigned int>& rhs) const{
-            return (lhs.first < rhs.first);
-        }
-    };
-
-	struct compareToFirstString {
-		bool operator() (const std::pair<std::string,std::string>& lhs,const std::string& rhs) const{
-			return (lhs.first.compare(rhs)<0);
-		}
-	};
-
-
-    struct compareToFirstInt {
-        bool operator() (const std::pair<unsigned int, unsigned int>& lhs,const unsigned int rhs) const{
-            return (lhs.first <= rhs);
-        }
-    };
-
-	static bool compareToFirstInt(const std::pair<unsigned int, unsigned int>& lhs, const std::pair<unsigned int, unsigned int>&  rhs){
-		return (lhs.first <= rhs.first);
-	}
-
-
-private:
-	std::string inDB;
-	std::string outDB;
-    std::string filterFile;
-
-    int sortingMode;
-    
-	int threads;
-	int compressed;
-
-	size_t column;
-	int columnToTake;
-    std::string regexStr;
-    bool trimToOneColumn;
-    // positiveFilter = true => outDB = inDB \intersect filter ; othw : outDB = inDB - filter
-    bool positiveFiltering;
-    int numberOfLines;
-    int mode;
-	double compValue;
-	std::string compOperator;
-    bool shouldAddSelfMatch;
-    ExpressionParser* parser;
-    std::vector<int> bindableParserColumns;
-
-    DBWriter* dbw;
-	DBReader<unsigned int>* dataDb;
-	DBReader<unsigned int>* joinDB;
-    DBReader<unsigned int>* swapDB;
-    DBReader<unsigned int>* clusterDB;
-	
-	regex_t regex;
-	std::vector<std::string> filter;
-
-	std::vector<std::pair<std::string,std::string>> mapping;
-	
-	int initFiles();
-	
-
-};
-
-#endif
diff --git a/src/util/indexdb.cpp b/src/util/indexdb.cpp
index 154d079..d7bcf97 100644
--- a/src/util/indexdb.cpp
+++ b/src/util/indexdb.cpp
@@ -21,7 +21,7 @@ std::string findIncompatibleParameter(DBReader<unsigned int>& index, const Param
         return "maxSeqLen";
     if (meta.seqType != dbtype)
         return "seqType";
-    if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES) == false && par.searchType != Parameters::SEARCH_TYPE_NUCLEOTIDES && meta.alphabetSize != par.alphabetSize)
+    if (Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES) == false && par.searchType != Parameters::SEARCH_TYPE_NUCLEOTIDES && meta.alphabetSize != par.alphabetSize.aminoacids)
         return "alphabetSize";
     if (meta.kmerSize != par.kmerSize)
         return "kmerSize";
diff --git a/src/util/maskbygff.cpp b/src/util/maskbygff.cpp
index 295343b..23c5308 100644
--- a/src/util/maskbygff.cpp
+++ b/src/util/maskbygff.cpp
@@ -69,7 +69,7 @@ int maskbygff(int argc, const char **argv, const Command& command) {
 
         size_t id = reader.getId(name);
         if(id == UINT_MAX) {
-            Debug(Debug::ERROR) << "GFF entry not found in fasta ffindex: " << name << "!\n";
+            Debug(Debug::ERROR) << "GFF entry not found in input database: " << name << "!\n";
             return EXIT_FAILURE;
         }
 
diff --git a/src/util/masksequence.cpp b/src/util/masksequence.cpp
index 828d56f..935a9f3 100644
--- a/src/util/masksequence.cpp
+++ b/src/util/masksequence.cpp
@@ -1,6 +1,3 @@
-#include <string>
-#include <fstream>
-#include <climits>
 #include "NucleotideMatrix.h"
 #include "SubstitutionMatrix.h"
 #include "tantan.h"
@@ -10,7 +7,6 @@
 #include "Util.h"
 #include "FileUtil.h"
 
-
 #ifdef OPENMP
 #include <omp.h>
 #endif
@@ -54,7 +50,7 @@ int masksequence(int argc, const char **argv, const Command& command) {
             char *seqData = reader.getData(id, thread_idx);
             unsigned int seqLen = 0;
             while (seqData[seqLen] != '\0') {
-                charSequence[seqLen] = (char) subMat->aa2int[(int) seqData[seqLen]];
+                charSequence[seqLen] = (char) subMat->aa2num[static_cast<int>(seqData[seqLen])];
                 seqLen++;
             }
             tantan::maskSequences(charSequence,
diff --git a/src/util/mergeclusters.cpp b/src/util/mergeclusters.cpp
index 2d9067c..822d9b8 100644
--- a/src/util/mergeclusters.cpp
+++ b/src/util/mergeclusters.cpp
@@ -128,7 +128,7 @@ int mergeclusters(int argc, const char **argv, const Command &command) {
             progress.updateProgress();
 
             // no cluster for this representative
-            if (mergedClustering[i].size() == 0)
+            if (mergedClustering[i].empty())
                 continue;
 
             // representative
diff --git a/src/util/msa2profile.cpp b/src/util/msa2profile.cpp
index 9a24539..f95f6b3 100644
--- a/src/util/msa2profile.cpp
+++ b/src/util/msa2profile.cpp
@@ -45,7 +45,12 @@ int msa2profile(int argc, const char **argv, const Command &command) {
         sequenceReader->open(DBReader<unsigned int>::SORT_BY_LINE);
     }
 
-    DBReader<unsigned int> qDbr(msaData.c_str(), msaIndex.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+    unsigned int mode = DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA;
+    std::string lookupFile = msaData + ".lookup";
+    if (FileUtil::fileExists(lookupFile.c_str())) {
+        mode |= DBReader<unsigned int>::USE_LOOKUP;
+    }
+    DBReader<unsigned int> qDbr(msaData.c_str(), msaIndex.c_str(), par.threads, mode);
     qDbr.open(DBReader<unsigned int>::LINEAR_ACCCESS);
 
     Debug(Debug::INFO) << "Finding maximum sequence length and set size.\n";
@@ -140,7 +145,7 @@ int msa2profile(int argc, const char **argv, const Command &command) {
 
         const bool maskByFirst = par.matchMode == 0;
         const float matchRatio = par.matchRatio;
-        MsaFilter filter(maxSeqLength + 1, maxSetSize, &subMat, par.gapOpen, par.gapExtend);
+        MsaFilter filter(maxSeqLength + 1, maxSetSize, &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
 
 #pragma omp for schedule(dynamic, 1)
         for (size_t id = 0; id < qDbr.getSize(); ++id) {
@@ -207,7 +212,6 @@ int msa2profile(int argc, const char **argv, const Command &command) {
                 // first sequence is always the query
                 if (setSize == 0) {
                     centerLengthWithGaps = static_cast<unsigned int>(strlen(seq->seq.s));
-
                     if (maskByFirst == true) {
                         for (size_t i = 0; i < centerLengthWithGaps; ++i) {
                             if (seq->seq.s[i] == '-') {
@@ -218,14 +222,15 @@ int msa2profile(int argc, const char **argv, const Command &command) {
                             }
                         }
                     }
-
-                    std::string header(seq->name.s);
-                    if (seq->comment.l > 0) {
-                        header.append(" ");
-                        header.append(seq->comment.s);
+                    if ((mode & DBReader<unsigned int>::USE_LOOKUP) == 0) {
+                        std::string header(seq->name.s);
+                        if (seq->comment.l > 0) {
+                            header.append(" ");
+                            header.append(seq->comment.s);
+                        }
+                        header.append("\n");
+                        headerWriter.writeData(header.c_str(), header.size(), queryKey, thread_idx);
                     }
-                    header.append("\n");
-                    headerWriter.writeData(header.c_str(), header.size(), queryKey, thread_idx);
                 }
 
                 sequence.mapSequence(0, 0, seq->seq.s, seq->seq.l);
@@ -244,7 +249,7 @@ int msa2profile(int argc, const char **argv, const Command &command) {
                     if (seq->seq.s[i] == '-'){
                         msaContent[msaPos++] = MultipleAlignment::GAP;
                     } else {
-                        int aa = sequence.int_sequence[i];
+                        int aa = sequence.numSequence[i];
                         msaContent[msaPos++] = aa;
                     }
                 }
@@ -318,11 +323,10 @@ int msa2profile(int argc, const char **argv, const Command &command) {
 
             size_t filteredSetSize = setSize;
             if (par.filterMsa == 1) {
-                filter.filter(setSize, centerLength, static_cast<int>(par.covMSAThr * 100),
+                filteredSetSize = filter.filter(setSize, centerLength, static_cast<int>(par.covMSAThr * 100),
                               static_cast<int>(par.qid * 100), par.qsc,
                               static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff,
-                              (const char **) msaSequences, &filteredSetSize);
-                filter.shuffleSequences((const char **) msaSequences, setSize);
+                              (const char **) msaSequences);
             }
 
             PSSMCalculator::Profile pssmRes =
@@ -334,10 +338,16 @@ int msa2profile(int argc, const char **argv, const Command &command) {
                 }
                 // write query, consensus sequence and neffM
                 result.push_back(static_cast<unsigned char>(msaSequences[0][pos]));
-                result.push_back(static_cast<unsigned char>(subMat.aa2int[static_cast<int>(pssmRes.consensus[pos])]));
+                result.push_back(subMat.aa2num[static_cast<int>(pssmRes.consensus[pos])]);
                 result += MathUtil::convertNeffToChar(pssmRes.neffM[pos]);
             }
 
+            if (mode & DBReader<unsigned int>::USE_LOOKUP) {
+                size_t lookupId = qDbr.getLookupIdByKey(queryKey);
+                std::string header = qDbr.getLookupEntryName(lookupId);
+                header.append(1, '\n');
+                headerWriter.writeData(header.c_str(), header.length(), queryKey, thread_idx);
+            }
             resultWriter.writeData(result.c_str(), result.length(), queryKey, thread_idx);
             result.clear();
         }
@@ -352,6 +362,8 @@ int msa2profile(int argc, const char **argv, const Command &command) {
     resultWriter.close(true);
     qDbr.close();
 
+    DBReader<unsigned int>::softlinkDb(par.db1, par.db2, (DBFiles::Files)(DBFiles::LOOKUP | DBFiles::SOURCE));
+
     if (sequenceReader != NULL) {
         sequenceReader->close();
         delete sequenceReader;
diff --git a/src/util/offsetalignment.cpp b/src/util/offsetalignment.cpp
index a6ac056..dac0495 100644
--- a/src/util/offsetalignment.cpp
+++ b/src/util/offsetalignment.cpp
@@ -214,6 +214,7 @@ int offsetalignment(int argc, const char **argv, const Command &command) {
     bool isSameSrcDB = (par.db3.compare(par.db1) == 0);
     bool isNuclNuclSearch = false;
     bool isTransNucTransNucSearch = false;
+    bool isTransNuclAln = false;
     if (targetNucl) {
         bool seqtargetNuc = true;
         if(isSameSrcDB){
@@ -236,6 +237,10 @@ int offsetalignment(int argc, const char **argv, const Command &command) {
             } else if(par.searchType == Parameters::SEARCH_TYPE_NUCLEOTIDES){
                 seqtargetNuc = true;
                 isTransNucTransNucSearch = false;
+            } else if(par.searchType == Parameters::SEARCH_TYPE_TRANS_NUCL_ALN){
+                isTransNuclAln = true;
+                seqtargetNuc = false;
+                isTransNucTransNucSearch = true;
             }
         }
 
@@ -392,7 +397,7 @@ int offsetalignment(int argc, const char **argv, const Command &command) {
                         for(size_t i = 0; i < results.size(); i++) {
                             Matcher::result_t &res = results[i];
                             bool hasBacktrace = (res.backtrace.size() > 0);
-                            if (isNuclNuclSearch == false && hasBacktrace) {
+                            if (isTransNuclAln == true && isNuclNuclSearch == false && isTransNucTransNucSearch == true && hasBacktrace) {
                                 newBacktrace.reserve(res.backtrace.length() * 3);
                                 Matcher::result_t::protein2nucl(res.backtrace, newBacktrace);
                                 res.backtrace = newBacktrace;
@@ -423,7 +428,7 @@ int offsetalignment(int argc, const char **argv, const Command &command) {
                     for(size_t i = 0; i < results.size(); i++){
                         Matcher::result_t &res = results[i];
                         bool hasBacktrace = (res.backtrace.size() > 0);
-                        if (isNuclNuclSearch == false && hasBacktrace) {
+                        if (isTransNuclAln == true && isNuclNuclSearch == false && isTransNucTransNucSearch == true && hasBacktrace) {
                             newBacktrace.reserve(res.backtrace.length() * 3);
                             Matcher::result_t::protein2nucl(res.backtrace, newBacktrace);
                             res.backtrace = newBacktrace;
@@ -438,7 +443,7 @@ int offsetalignment(int argc, const char **argv, const Command &command) {
                     for(size_t i = 0; i < tmp.size(); i++){
                         Matcher::result_t &res = tmp[i];
                         bool hasBacktrace = (res.backtrace.size() > 0);
-                        if (isNuclNuclSearch == false && hasBacktrace) {
+                        if (isTransNuclAln == true && isNuclNuclSearch == false && isTransNucTransNucSearch == true && hasBacktrace) {
                             newBacktrace.reserve(res.backtrace.length() * 3);
                             Matcher::result_t::protein2nucl(res.backtrace, newBacktrace);
                             res.backtrace = newBacktrace;
diff --git a/src/util/orftocontig.cpp b/src/util/orftocontig.cpp
index 2309750..7b2e5e0 100644
--- a/src/util/orftocontig.cpp
+++ b/src/util/orftocontig.cpp
@@ -33,7 +33,7 @@ int orftocontig(int argn, const char **argv, const Command& command) {
 #ifdef OPENMP
         thread_idx = static_cast<unsigned int>(omp_get_thread_num());
 #endif 
-        char orfToContigBuffer[LINE_MAX];
+        char orfToContigBuffer[1024];
         
 #pragma omp for schedule(dynamic, 100)
         for (size_t id = 0; id < orfHeadersReader.getSize(); ++id) {
diff --git a/src/util/profile2cs.cpp b/src/util/profile2cs.cpp
index 460cbd5..10338ff 100644
--- a/src/util/profile2cs.cpp
+++ b/src/util/profile2cs.cpp
@@ -68,12 +68,12 @@ int profile2cs(int argc, const char **argv, const Command &command) {
                 // DEBUG: in case of pure state library, check when the wrong pure state has been chosen
                 /*for (size_t k = 0; k<result.size();k++)
                 {
-                    if (subMat.subMatrix[ProfileStates::hh2mmseqsAAorder((int)result[k])][seq.int_consensus_sequence[k]]<0)
+                    if (subMat.subMatrix[ProfileStates::hh2mmseqsAAorder((int)result[k])][seq.consensus_sequence[k]]<0)
                     {
-                        std::cout<<"Pos: "<<k<<", "<<subMat.int2aa[ProfileStates::hh2mmseqsAAorder((int)result[k])]<<"-"<<subMat.int2aa[seq.int_consensus_sequence[k]]<<"("<<subMat.subMatrix[ProfileStates::hh2mmseqsAAorder((int)result[k])][seq.int_consensus_sequence[k]]<<") \n";
+                        std::cout<<"Pos: "<<k<<", "<<subMat.num2aa[ProfileStates::hh2mmseqsAAorder((int)result[k])]<<"-"<<subMat.num2aa[seq.consensus_sequence[k]]<<"("<<subMat.subMatrix[ProfileStates::hh2mmseqsAAorder((int)result[k])][seq.consensus_sequence[k]]<<") \n";
                         for (size_t aa = 0; aa<20;aa++)
                         {
-                            std::cout<<subMat.int2aa[aa]<<"\t";
+                            std::cout<<subMat.num2aa[aa]<<"\t";
                         }
                         std::cout<<"\n";
                         for (size_t aa = 0; aa<20;aa++)
diff --git a/src/util/profile2pssm.cpp b/src/util/profile2pssm.cpp
index 77bdbcb..023c391 100644
--- a/src/util/profile2pssm.cpp
+++ b/src/util/profile2pssm.cpp
@@ -59,7 +59,7 @@ int profile2pssm(int argc, const char **argv, const Command &command) {
             result.append("Pos\tCns");
             for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
                 result.push_back('\t');
-                result.push_back(subMat.int2aa[aa]);
+                result.push_back(subMat.num2aa[aa]);
             }
             result.push_back('\n');
 
@@ -67,7 +67,7 @@ int profile2pssm(int argc, const char **argv, const Command &command) {
                 Itoa::i32toa_sse2(j, buffer);
                 result.append(buffer);
                 result.push_back('\t');
-                result.push_back(subMat.int2aa[seq.int_consensus_sequence[j]]);
+                result.push_back(subMat.num2aa[seq.numConsensusSequence[j]]);
                 for (size_t aa = 0; aa < Sequence::PROFILE_AA_SIZE; aa++) {
                     result.push_back('\t');
                     Itoa::i32toa_sse2(seq.profile_for_alignment[aa * seq.L + j], buffer);
diff --git a/src/util/profile2seq.cpp b/src/util/profile2seq.cpp
index ae78448..9cbdc4c 100644
--- a/src/util/profile2seq.cpp
+++ b/src/util/profile2seq.cpp
@@ -37,9 +37,9 @@ int profile2seq(int argc, const char **argv, const Command &command, bool consen
         for (size_t i = 0; i < entries; ++i) {
             progress.updateProgress();
             seq.mapProfile(reader.getData(i, thread_idx), false, reader.getSeqLen(i));
-            int* sequence = consensus ? seq.int_consensus_sequence : seq.int_sequence;
+            unsigned char * sequence = consensus ? seq.numConsensusSequence : seq.numSequence;
             for (int aa = 0; aa < seq.L; aa++) {
-                result.append(1, subMat.int2aa[sequence[aa]]);
+                result.append(1, subMat.num2aa[sequence[aa]]);
             }
             result.append(1, '\n');
             writer.writeData(result.c_str(), result.length(), reader.getDbKey(i), thread_idx);
diff --git a/src/util/result2dnamsa.cpp b/src/util/result2dnamsa.cpp
new file mode 100644
index 0000000..b513940
--- /dev/null
+++ b/src/util/result2dnamsa.cpp
@@ -0,0 +1,154 @@
+// Computes MSAs from clustering or alignment result
+
+#include <string>
+#include <vector>
+#include <sstream>
+#include <Matcher.h>
+#include <Orf.h>
+
+#include "Parameters.h"
+#include "DBReader.h"
+#include "DBWriter.h"
+#include "Debug.h"
+#include "Util.h"
+
+#ifdef OPENMP
+#include <omp.h>
+#endif
+
+// Writes one DNA MSA per entry of the alignment result db (db3). Unless par.skipQuery is set,
+// the query header and sequence come first. Each hit is projected onto query coordinates via
+// its backtrace ('M' copies the target residue, 'I' emits a gap, 'D' drops the target residue)
+// and padded with '-' outside the aligned query range. Opposite-strand hits are read backwards
+// through Orf::complement. Always returns EXIT_SUCCESS.
+int result2dnamsa(int argc, const char **argv, const Command &command) {
+    Parameters &par = Parameters::getInstance();
+    par.parseParameters(argc, argv, command, true, 0, 0);
+
+    DBReader<unsigned int> qDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+    qDbr.open(DBReader<unsigned int>::NOSORT);
+
+    DBReader<unsigned int> queryHeaderReader(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+    // NOSORT because the index should be in the same order as resultReader
+    queryHeaderReader.open(DBReader<unsigned int>::NOSORT);
+
+    // The target readers alias the query readers unless db2 names a different database.
+    DBReader<unsigned int> *tDbr = &qDbr;
+    DBReader<unsigned int> *templateHeaderReader = &queryHeaderReader;
+    const bool sameDatabase = (par.db1 == par.db2);
+    if (!sameDatabase) {
+        tDbr = new DBReader<unsigned int>(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+        tDbr->open(DBReader<unsigned int>::NOSORT);
+
+        templateHeaderReader = new DBReader<unsigned int>(par.hdr2.c_str(), par.hdr2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+        templateHeaderReader->open(DBReader<unsigned int>::NOSORT);
+    }
+
+    DBReader<unsigned int> resultReader(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+    resultReader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
+
+    DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_MSA_DB);
+    resultWriter.open();
+
+    Debug(Debug::INFO) << "Query database size: "  << qDbr.getSize() << " type: " << qDbr.getDbTypeName() << "\n";
+    Debug(Debug::INFO) << "Target database size: " << tDbr->getSize() << " type: " << tDbr->getDbTypeName() << "\n";
+
+    Debug::Progress progress(resultReader.getSize());
+
+#pragma omp parallel
+    {
+        unsigned int thread_idx = 0;
+#ifdef OPENMP
+        thread_idx = (unsigned int) omp_get_thread_num();
+#endif
+        std::vector<Matcher::result_t> alnResults;
+        std::string out;
+
+#pragma omp  for schedule(dynamic, 10)
+        for (size_t id = 0; id < resultReader.getSize(); id++) {
+            progress.updateProgress();
+            alnResults.clear();
+            // Get the sequence from the queryDB
+            unsigned int queryKey = resultReader.getDbKey(id);
+            size_t queryId = qDbr.getId(queryKey);
+            resultWriter.writeStart(thread_idx);
+
+            if (par.skipQuery == false) {
+                char *centerSequenceHeader = queryHeaderReader.getData(queryId, thread_idx);
+                resultWriter.writeAdd(">", 1, thread_idx);
+                resultWriter.writeAdd(centerSequenceHeader, queryHeaderReader.getSeqLen(queryId)+1, thread_idx);
+                // Was getData(queryId, 0); pass thread_idx like every other read here (TODO confirm it selects a per-thread buffer).
+                char *seq = qDbr.getData(queryId, thread_idx);
+                resultWriter.writeAdd(seq, qDbr.getSeqLen(queryId)+1, thread_idx);
+            }
+            Matcher::readAlignmentResults(alnResults, resultReader.getData(id, thread_idx), false);
+            for (size_t i = 0; i < alnResults.size(); i++) {
+                // Work on a copy: the target positions and the backtrace are rewritten below.
+                Matcher::result_t res = alnResults[i];
+                const bool queryIsReversed = (res.qStartPos > res.qEndPos);
+                const bool targetIsReversed = (res.dbStartPos > res.dbEndPos);
+                const size_t targetId = tDbr->getId(res.dbKey);
+                out.clear();
+                char *templateHeader = templateHeaderReader->getData(targetId, thread_idx);
+                resultWriter.writeAdd(">", 1, thread_idx);
+                resultWriter.writeAdd(templateHeader, templateHeaderReader->getSeqLen(targetId) + 1, thread_idx);
+                char *targetSeq = tDbr->getData(targetId, thread_idx);
+                unsigned int seqPos = 0;
+
+                // A reversed query flips the target range and the backtrace; the hit is read on the
+                // opposite strand when exactly one of query/target is reversed.
+                const bool isReverseStrand = (queryIsReversed != targetIsReversed);
+                if (queryIsReversed == true) {
+                    std::swap(res.dbStartPos, res.dbEndPos);
+                    std::reverse(res.backtrace.begin(), res.backtrace.end());
+                }
+
+                // Leading gaps up to the first aligned query column.
+                const int qStartPos = std::min(res.qStartPos, res.qEndPos);
+                for (int pos = 0; pos < qStartPos; ++pos) {
+                    out.push_back('-');
+                }
+                for (size_t pos = 0; pos < res.backtrace.size(); ++pos) {
+                    switch (res.backtrace[pos]) {
+                        case 'M': {
+                            const char seqChar = (isReverseStrand == true) ? Orf::complement(targetSeq[res.dbStartPos - seqPos])
+                                                                           : targetSeq[res.dbStartPos + seqPos];
+                            out.push_back(seqChar);
+                            seqPos++;
+                            break;
+                        }
+                        case 'I':
+                            out.push_back('-');
+                            break;
+                        case 'D':
+                            // Target residue is consumed without producing an output column.
+                            seqPos++;
+                            break;
+                    }
+                }
+                // Trailing gaps from the last aligned query column to the end of the query.
+                const int qEndPos = std::max(res.qStartPos, res.qEndPos);
+                for (unsigned int pos = qEndPos+1; pos < res.qLen; ++pos) {
+                    out.push_back('-');
+                }
+                out.push_back('\n');
+                resultWriter.writeAdd(out.c_str(), out.size(), thread_idx);
+            }
+            resultWriter.writeEnd(queryKey, thread_idx);
+        }
+    }
+
+    // cleanup
+    resultWriter.close(true);
+    resultReader.close();
+    queryHeaderReader.close();
+    qDbr.close();
+
+    if (!sameDatabase) {
+        templateHeaderReader->close();
+        delete templateHeaderReader;
+        tDbr->close();
+        delete tDbr;
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/src/util/result2flat.cpp b/src/util/result2flat.cpp
index e553a8e..322ef61 100644
--- a/src/util/result2flat.cpp
+++ b/src/util/result2flat.cpp
@@ -26,7 +26,6 @@ int result2flat(int argc, const char **argv, const Command &command) {
     char header_start[] = {'>'};
     char newline[] = {'\n'};
 
-    char *dbKeyBuffer = new char[par.maxSeqLen * 20];
     for (size_t i = 0; i < dbr_data.getSize(); i++) {
 
         // Write the header, taken from the original queryDB
@@ -52,21 +51,31 @@ int result2flat(int argc, const char **argv, const Command &command) {
 
         // write data
         char *data = dbr_data.getData(i, 0);
+        std::string dbKeyBuffer;
+        const char * words[2];
         while (*data != '\0') {
             // dbKeyBuffer can contain sequence
-            Util::parseKey(data, dbKeyBuffer);
-            const unsigned int dbKey = (unsigned int) strtoul(dbKeyBuffer, NULL, 10);
-            char *header_data = targetdb_header.getDataByDBKey(dbKey, 0);
+            Util::getWordsOfLine(data, words, 2);
+            //Util::parseKey(data, dbKeyBuffer);
+            char *target_header_data = NULL;
+            size_t keyLen = 0;
+            // If the input dbtype is one of DbValidator::resultDb, parse the first word of this line as the target key and fetch its header.
+            for(size_t  t = 0; t < DbValidator::resultDb.size(); t++){
+                if(Parameters::isEqualDbtype(dbr_data.getDbtype(), DbValidator::resultDb[t])  ) {
+                    keyLen = (words[1] - words[0]);
+                    dbKeyBuffer.assign(words[0], keyLen); // assign, not append: the buffer is reused for every line, so stale keys must not linger in front
+                    const unsigned int dbKey = (unsigned int) strtoul(dbKeyBuffer.c_str(), NULL, 10);
+                    target_header_data = targetdb_header.getDataByDBKey(dbKey, 0);
+                }
+            }
             std::string dataStr;
-            if (par.useHeader == true && header_data != NULL && dbr_data.getDbtype() == -1)
+            if (par.useHeader == true && target_header_data != NULL)
             {
-                dataStr = Util::parseFastaHeader(header_data);
+                dataStr = Util::parseFastaHeader(target_header_data);
                 char *endLenData = Util::skipLine(data);
-                size_t keyLen = strlen(dbKeyBuffer);
                 char *dataWithoutKey = data + keyLen;
                 size_t dataToCopySize = endLenData - dataWithoutKey;
-                std::string data(dataWithoutKey, dataToCopySize);
-                dataStr.append(data);
+                dataStr.append(dataWithoutKey, dataToCopySize);
             } else {
                 char *startLine = data;
                 char *endLine = Util::skipLine(data);
@@ -87,7 +96,6 @@ int result2flat(int argc, const char **argv, const Command &command) {
             data = Util::skipLine(data);
         }
     }
-    delete[] dbKeyBuffer;
 
 
     fclose(fastaFP);
diff --git a/src/util/result2msa.cpp b/src/util/result2msa.cpp
index 76f744c..f71b668 100644
--- a/src/util/result2msa.cpp
+++ b/src/util/result2msa.cpp
@@ -80,7 +80,7 @@ int result2msa(Parameters &par, const std::string &resultData, const std::string
 
     Debug(Debug::INFO) << "Start computing "
                        << (par.compressMSA ? "compressed" : "") << " multiple sequence alignments.\n";
-    EvalueComputation evalueComputation(tDbr->getAminoAcidDBSize(), &subMat, par.gapOpen, par.gapExtend);
+    EvalueComputation evalueComputation(tDbr->getAminoAcidDBSize(), &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
     if (qDbr.getDbtype() == -1 || tDbr->getDbtype() == -1) {
         Debug(Debug::ERROR) << "Please recreate your database or add a .dbtype file to your sequence/profile database.\n";
         EXIT(EXIT_FAILURE);
@@ -102,10 +102,10 @@ int result2msa(Parameters &par, const std::string &resultData, const std::string
         thread_idx = (unsigned int) omp_get_thread_num();
 #endif
 
-        Matcher matcher(qDbr.getDbtype(), maxSequenceLength, &subMat, &evalueComputation, par.compBiasCorrection, par.gapOpen, par.gapExtend);
+        Matcher matcher(qDbr.getDbtype(), maxSequenceLength, &subMat, &evalueComputation, par.compBiasCorrection, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
         MultipleAlignment aligner(maxSequenceLength, maxSetSize, &subMat, &matcher);
         PSSMCalculator calculator(&subMat, maxSequenceLength, maxSetSize, par.pca, par.pcb);
-        MsaFilter filter(maxSequenceLength, maxSetSize, &subMat, par.gapOpen, par.gapExtend);
+        MsaFilter filter(maxSequenceLength, maxSetSize, &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
         UniprotHeaderSummarizer summarizer;
         Sequence centerSequence(maxSequenceLength, qDbr.getDbtype(), &subMat, 0, false, par.compBiasCorrection);
 
@@ -133,14 +133,14 @@ int result2msa(Parameters &par, const std::string &resultData, const std::string
             // TODO: Do we still need this?
             if (centerSequence.L)
             {
-                if(centerSequence.int_sequence[centerSequence.L-1] == 20) // remove last in it is a *
+                if(centerSequence.numSequence[centerSequence.L-1] == 20) // remove last in it is a *
                 {
                     centerSequence.L--;
                 }
             }
             char *centerSequenceHeader = queryHeaderReader.getDataByDBKey(queryKey, 0);
 
-            char *results = resultReader.getData(id, 0);
+            char *results = resultReader.getData(id, thread_idx);
             std::vector<Matcher::result_t> alnResults;
             std::vector<Sequence *> seqSet;
             while (*results != '\0') {
@@ -188,10 +188,9 @@ int result2msa(Parameters &par, const std::string &resultData, const std::string
             alnResults = res.alignmentResults;
             size_t filteredSetSize = res.setSize;
             if (isFiltering) {
-                filter.filter(res.setSize, res.centerLength, static_cast<int>(par.covMSAThr * 100),
+                filteredSetSize = filter.filter(res, static_cast<int>(par.covMSAThr * 100),
                               static_cast<int>(par.qid * 100), par.qsc,
-                              static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff,
-                              (const char **) res.msaSequence, &filteredSetSize);
+                              static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff);
                 filter.getKept(kept, res.setSize);
             }
 
@@ -240,7 +239,7 @@ int result2msa(Parameters &par, const std::string &resultData, const std::string
                     // need to allow insertion in the centerSequence
                     for (size_t pos = 0; pos < res.centerLength; pos++) {
                         char aa = res.msaSequence[i][pos];
-                        msa << ((aa < MultipleAlignment::NAA) ? subMat.int2aa[(int) aa] : '-');
+                        msa << ((aa < MultipleAlignment::NAA) ? subMat.num2aa[(int) aa] : '-');
                     }
 
                     msa << "\n";
@@ -260,10 +259,6 @@ int result2msa(Parameters &par, const std::string &resultData, const std::string
 
                 std::ostringstream msa;
                 if (par.omitConsensus == false) {
-                    if (isFiltering) {
-                        filter.shuffleSequences((const char **) res.msaSequence, res.setSize);
-                    }
-
                     for (size_t pos = 0; pos < res.centerLength; pos++) {
                         if (res.msaSequence[0][pos] == MultipleAlignment::GAP) {
                             Debug(Debug::ERROR) <<  "Error in computePSSMFromMSA. First sequence of MSA is not allowed to contain gaps.\n";
@@ -279,7 +274,7 @@ int result2msa(Parameters &par, const std::string &resultData, const std::string
                     std::ostringstream centerSeqStr;
                     // Retrieve the master sequence
                     for (int pos = 0; pos < centerSequence.L; pos++) {
-                        centerSeqStr << subMat.int2aa[centerSequence.int_sequence[pos]];
+                        centerSeqStr << subMat.num2aa[centerSequence.numSequence[pos]];
                     }
                     msa << ">" << queryHeaderReader.getDataByDBKey(queryKey,  thread_idx) << centerSeqStr.str() << "\n;";
                 }
@@ -337,7 +332,6 @@ int result2msa(Parameters &par) {
         // Use only 1 thread for concat to ensure the same order as the later header concat
         referenceDBr = new DBConcat(par.db1, par.db1Index, par.db2, par.db2Index,
                                     referenceSeqName, referenceSeqIndexName, 1);
-        referenceDBr->concat();
         // When exporting in ca3m,
         // we need to have an access in SORT_BY_LINE
         // mode in order to keep track of the original
@@ -353,7 +347,6 @@ int result2msa(Parameters &par) {
         // Use only 1 thread for concat to ensure the same order as the former sequence concat
         DBConcat referenceHeadersDBr(par.hdr1, par.hdr1Index, par.hdr2, par.hdr2Index,
                                      referenceHeadersName, referenceHeadersIndexName, 1);
-        referenceHeadersDBr.concat();
 
         outDb.append("_ca3m.ffdata");
         outIndex = par.db4;
@@ -393,8 +386,7 @@ int result2msa(Parameters &par, const unsigned int mpiRank, const unsigned int m
 
         // Use only 1 thread for concat to ensure the same order as the later header concat
         referenceDBr = new DBConcat(par.db1, par.db1Index, par.db2, par.db2Index,
-                                    referenceSeqName, referenceSeqIndexName, 1);
-        referenceDBr->concat(MMseqsMPI::isMaster());
+                                    referenceSeqName, referenceSeqIndexName, 1, MMseqsMPI::isMaster());
 
 #ifdef HAVE_MPI
         MPI_Barrier(MPI_COMM_WORLD);
@@ -415,7 +407,6 @@ int result2msa(Parameters &par, const unsigned int mpiRank, const unsigned int m
             // Use only 1 thread for concat to ensure the same order as the former sequence concat
             DBConcat referenceHeadersDBr(par.hdr1, par.hdr1Index, par.hdr2, par.hdr2Index,
                                          referenceHeadersName, referenceHeadersIndexName, 1);
-            referenceHeadersDBr.concat();
         }
 
         outDb.append("_ca3m.ffdata");
@@ -444,7 +435,6 @@ int result2msa(Parameters &par, const unsigned int mpiRank, const unsigned int m
             splitFiles.push_back(std::make_pair(tmpFile.first, tmpFile.second));
 
         }
-        // merge output ffindex databases
         DBWriter::mergeResults(outDb, outIndex, splitFiles, par.compressMSA);
     }
 
diff --git a/src/util/result2pp.cpp b/src/util/result2pp.cpp
index e33a495..1218936 100644
--- a/src/util/result2pp.cpp
+++ b/src/util/result2pp.cpp
@@ -7,7 +7,6 @@
 #include "PSSMCalculator.h"
 #include "DBWriter.h"
 #include "DBReader.h"
-#include "DBConcat.h"
 #include "HeaderSummarizer.h"
 #include "CompressedA3M.h"
 #include "Debug.h"
@@ -17,7 +16,6 @@
 #include "SubstitutionMatrix.h"
 #include <string>
 #include <vector>
-#include <sstream>
 
 #ifdef OPENMP
 #include <omp.h>
@@ -240,7 +238,7 @@ int computeProfileProfile(Parameters &par,const std::string &outpath,
                 //std::cout<<std::endl;
                 
                 // write query, consensus sequence and neffM
-                result.push_back(static_cast<unsigned char>(queryProfile.int_sequence[l]));
+                result.push_back(queryProfile.numSequence[l]);
                 result.push_back(consensus[l]);
                 unsigned char neff = MathUtil::convertNeffToChar(neffM[l]);
                 result.push_back(neff);
@@ -315,7 +313,6 @@ int computeProfileProfile(Parameters &par,const unsigned int mpiRank, const unsi
             splitFiles.push_back(std::make_pair(tmpFile.first ,  tmpFile.first + ".index"));
 
         }
-        // merge output ffindex databases
         DBWriter::mergeResults(outname , outname + ".index", splitFiles);
     }
 
diff --git a/src/util/result2profile.cpp b/src/util/result2profile.cpp
index 17a5838..7e3529e 100644
--- a/src/util/result2profile.cpp
+++ b/src/util/result2profile.cpp
@@ -80,7 +80,7 @@ int result2profile(DBReader<unsigned int> &resultReader, Parameters &par, const
     // adjust score of each match state by -0.2 to trim alignment
     SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0f, -0.2f);
     ProbabilityMatrix probMatrix(subMat);
-    EvalueComputation evalueComputation(tDbr->getAminoAcidDBSize(), &subMat, par.gapOpen, par.gapExtend);
+    EvalueComputation evalueComputation(tDbr->getAminoAcidDBSize(), &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
 
     if (qDbr->getDbtype() == -1 || targetSeqType == -1) {
         Debug(Debug::ERROR) << "Please recreate your database or add a .dbtype file to your sequence/profile database\n";
@@ -96,7 +96,7 @@ int result2profile(DBReader<unsigned int> &resultReader, Parameters &par, const
     Debug(Debug::INFO) << "Target database size: " << tDbr->getSize() << " type: " << Parameters::getDbTypeName(targetSeqType) << "\n";
 
     const bool isFiltering = par.filterMsa != 0;
-    int xAmioAcid = subMat.aa2int[(int) 'X'];
+    int xAmioAcid = subMat.aa2num[static_cast<int>('X')];
     Debug::Progress progress(dbSize);
 #pragma omp parallel num_threads(localThreads)
     {
@@ -105,10 +105,10 @@ int result2profile(DBReader<unsigned int> &resultReader, Parameters &par, const
         thread_idx = (unsigned int) omp_get_thread_num();
 #endif
 
-        Matcher matcher(qDbr->getDbtype(), maxSequenceLength, &subMat, &evalueComputation, par.compBiasCorrection, par.gapOpen, par.gapExtend);
+        Matcher matcher(qDbr->getDbtype(), maxSequenceLength, &subMat, &evalueComputation, par.compBiasCorrection, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
         MultipleAlignment aligner(maxSequenceLength, maxSetSize, &subMat, &matcher);
         PSSMCalculator calculator(&subMat, maxSequenceLength, maxSetSize, par.pca, par.pcb);
-        MsaFilter filter(maxSequenceLength, maxSetSize, &subMat, par.gapOpen, par.gapExtend);
+        MsaFilter filter(maxSequenceLength, maxSetSize, &subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
         Sequence centerSequence(maxSequenceLength, qDbr->getDbtype(), &subMat, 0, false, par.compBiasCorrection);
         std::string result;
         result.reserve((maxSequenceLength + 1) * Sequence::PROFILE_READIN_SIZE);
@@ -176,11 +176,9 @@ int result2profile(DBReader<unsigned int> &resultReader, Parameters &par, const
 
             size_t filteredSetSize = res.setSize;
             if (isFiltering) {
-                filter.filter(res.setSize, res.centerLength, static_cast<int>(par.covMSAThr * 100),
+                filteredSetSize = filter.filter(res, static_cast<int>(par.covMSAThr * 100),
                               static_cast<int>(par.qid * 100), par.qsc,
-                              static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff,
-                              (const char **) res.msaSequence, &filteredSetSize);
-                filter.shuffleSequences((const char **) res.msaSequence, res.setSize);
+                              static_cast<int>(par.filterMaxSeqId * 100), par.Ndiff);
             }
             //MultipleAlignment::print(res, &subMat);
 
@@ -194,7 +192,7 @@ int result2profile(DBReader<unsigned int> &resultReader, Parameters &par, const
             PSSMCalculator::Profile pssmRes = calculator.computePSSMFromMSA(filteredSetSize, res.centerLength, (const char **) res.msaSequence, par.wg);
             if (par.maskProfile == true) {
                 for (int i = 0; i < centerSequence.L; ++i) {
-                    charSequence[i] = (char) centerSequence.int_sequence[i];
+                    charSequence[i] = (unsigned char ) centerSequence.numSequence[i];
                 }
 
                 tantan::maskSequences(charSequence, charSequence + centerSequence.L,
@@ -222,8 +220,8 @@ int result2profile(DBReader<unsigned int> &resultReader, Parameters &par, const
                     result.push_back(Sequence::scoreMask(pssmRes.prob[pos * Sequence::PROFILE_AA_SIZE + aa]));
                 }
                 // write query, consensus sequence and neffM
-                result.push_back(static_cast<unsigned char>(centerSequence.int_sequence[pos]));
-                result.push_back(static_cast<unsigned char>(subMat.aa2int[static_cast<int>(pssmRes.consensus[pos])]));
+                result.push_back(static_cast<unsigned char>(centerSequence.numSequence[pos]));
+                result.push_back(static_cast<unsigned char>(subMat.aa2num[static_cast<int>(pssmRes.consensus[pos])]));
                 unsigned char neff = MathUtil::convertNeffToChar(pssmRes.neffM[pos]);
                 result.push_back(neff);
             }
diff --git a/src/util/reverseseq.cpp b/src/util/reverseseq.cpp
index 8334167..a8b072d 100644
--- a/src/util/reverseseq.cpp
+++ b/src/util/reverseseq.cpp
@@ -20,6 +20,8 @@ int reverseseq(int argn, const char **argv, const Command& command) {
     revSeqWriter.open();
     Debug::Progress progress(seqReader.getSize());
 
+    bool isProfileInput = Parameters::isEqualDbtype(seqReader.getDbtype(), Parameters::DBTYPE_HMM_PROFILE);
+
 #pragma omp parallel
     {
         unsigned int thread_idx = 0;
@@ -35,10 +37,21 @@ int reverseseq(int argn, const char **argv, const Command& command) {
             unsigned int seqKey = seqReader.getDbKey(id);
             char *seq = seqReader.getData(id, thread_idx);
             size_t lenSeq = seqReader.getSeqLen(id);
+
             for (size_t i = 0; i < lenSeq; ++i) {
-                revStr.push_back(seq[lenSeq - i - 1]);
+                size_t revInd = lenSeq - i - 1;
+                if (isProfileInput) {
+                    revStr.append(seq + (Sequence::PROFILE_READIN_SIZE * revInd), Sequence::PROFILE_READIN_SIZE);
+                } else {
+                    revStr.push_back(seq[revInd]);
+                }
             }
-            revStr.push_back('\n');
+            
+            // for seqdb add \n
+            if (! isProfileInput) {
+                revStr.push_back('\n');
+            }
+
             revSeqWriter.writeData(revStr.c_str(), revStr.size(), seqKey, thread_idx, true);
             revStr.clear();
         }
diff --git a/src/util/sortresult.cpp b/src/util/sortresult.cpp
index 3d9d325..2f691e7 100644
--- a/src/util/sortresult.cpp
+++ b/src/util/sortresult.cpp
@@ -12,8 +12,6 @@
 
 int sortresult(int argc, const char **argv, const Command &command) {
     Parameters &par = Parameters::getInstance();
-    par.overrideParameterDescription((Command &) command, par.PARAM_MAX_SEQS.uniqid, "maximum result sequences per query", NULL,
-                                     par.PARAM_MAX_SEQS.category & ~MMseqsParameter::COMMAND_EXPERT);
     par.parseParameters(argc, argv, command, true, 0, 0);
 
     DBReader<unsigned int> reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
diff --git a/src/util/splitsequence.cpp b/src/util/splitsequence.cpp
index a1c7b2c..60ec7ae 100644
--- a/src/util/splitsequence.cpp
+++ b/src/util/splitsequence.cpp
@@ -67,7 +67,7 @@ int splitsequence(int argc, const char **argv, const Command& command) {
         if (querySize == 0) {
             queryFrom = 0;
         }
-        char buffer[LINE_MAX];
+        char buffer[1024];
 
         for (unsigned int i = queryFrom; i < (queryFrom + querySize); ++i){
             progress.updateProgress();
@@ -126,13 +126,12 @@ int splitsequence(int argc, const char **argv, const Command& command) {
         {
 #pragma omp task
             {
-                DBWriter::createRenumberedDB(par.hdr2, par.hdr2Index, "");
+                DBWriter::createRenumberedDB(par.hdr2, par.hdr2Index, "", "");
             }
 
 #pragma omp task
             {
-                std::string lookup = par.db1 + ".lookup";
-                DBWriter::createRenumberedDB(par.db2, par.db2Index, par.createLookup ? lookup : "");
+                DBWriter::createRenumberedDB(par.db2, par.db2Index, par.createLookup ? par.db1 : "", par.createLookup ? par.db1Index : "");
             }
         }
     }
diff --git a/src/util/summarizealis.cpp b/src/util/summarizealis.cpp
index eafab8b..a6e2fa6 100644
--- a/src/util/summarizealis.cpp
+++ b/src/util/summarizealis.cpp
@@ -41,7 +41,7 @@ int summarizealis(int argc, const char **argv, const Command &command) {
 
             char *data = reader.getData(i, thread_idx);
             Matcher::readAlignmentResults(alnResults, data);
-            if (alnResults.size() == 0) {
+            if (alnResults.empty()) {
                 writer.writeData("", 0, reader.getDbKey(i), thread_idx);
                 continue;
             }
diff --git a/src/util/summarizeresult.cpp b/src/util/summarizeresult.cpp
index ea04f41..c500b89 100644
--- a/src/util/summarizeresult.cpp
+++ b/src/util/summarizeresult.cpp
@@ -69,30 +69,17 @@ int summarizeresult(int argc, const char **argv, const Command &command) {
                     Debug(Debug::WARNING) << "Query alignment start or end is greater than query length! Skipping line.\n";
                     continue;
                 }
-                if (domain.qStartPos > domain.qEndPos) {
-                    Debug(Debug::WARNING) << "Query alignment end is greater than start! Skipping line.\n";
-                    continue;
-                }
-                if (domain.dbStartPos > domain.dbEndPos) {
-                    Debug(Debug::WARNING) << "Target alignment end is greater than start! Skipping line.\n";
-                    continue;
-                }
-                if (domain.dbStartPos > static_cast<int>(domain.dbLen) || domain.dbEndPos > static_cast<int>(domain.dbLen)) {
-                    Debug(Debug::WARNING) << "Target alignment start or end is greater than target length! Skipping line.\n";
-                    continue;
-                }
-
                 if (domain.dbcov <= par.covThr) {
                     continue;
                 }
 
                 size_t counter = 0;
-                for (int j = domain.qStartPos; j < domain.qEndPos; ++j) {
+                for (int j = std::min(domain.qStartPos, domain.qEndPos); j < std::max(domain.qStartPos, domain.qEndPos); ++j) {
                     counter += covered[j] ? 1 : 0;
                 }
-                const float percentageOverlap = static_cast<float>(counter) / static_cast<float>(domain.qEndPos - domain.qStartPos + 1);
+                const float percentageOverlap = static_cast<float>(counter) / static_cast<float>(std::max(domain.qStartPos, domain.qEndPos) - std::min(domain.qStartPos, domain.qEndPos) + 1);
                 if (percentageOverlap <= par.overlap) {
-                    for (int j = domain.qStartPos; j < domain.qEndPos; ++j) {
+                    for (int j = std::min(domain.qStartPos, domain.qEndPos); j < std::max(domain.qStartPos, domain.qEndPos); ++j) {
                         covered[j] = true;
                     }
                     size_t len = Matcher::resultToBuffer(buffer, domain, par.addBacktrace, false);
@@ -117,3 +104,4 @@ int summarizeresult(int argc, const char **argv, const Command &command) {
 
     return EXIT_SUCCESS;
 }
+
diff --git a/src/util/summarizetabs.cpp b/src/util/summarizetabs.cpp
index 21d34e3..dfe7961 100644
--- a/src/util/summarizetabs.cpp
+++ b/src/util/summarizetabs.cpp
@@ -27,7 +27,7 @@ static inline float getOverlap(const std::vector<bool>& covered, unsigned int qS
 std::vector<Domain> mapDomains(const std::vector<Domain> &input, float overlap, float minCoverage,
                                double eValThreshold) {
     std::vector<Domain> result;
-    if(input.size() == 0) {
+    if (input.empty()) {
         return result;
     }
 
@@ -149,13 +149,13 @@ int doAnnotate(Parameters &par, DBReader<unsigned int> &blastTabReader,
             char *tabData = blastTabReader.getData(i, thread_idx);
             size_t tabLength = blastTabReader.getEntryLen(i) - 1;
             const std::vector<Domain> entries = getEntries(id, tabData, tabLength, lengths);
-            if (entries.size() == 0) {
+            if (entries.empty()) {
                 Debug(Debug::WARNING) << "Can not map any entries for entry " << id << "!\n";
                 continue;
             }
 
             std::vector<Domain> result = mapDomains(entries, par.overlap, par.covThr, par.evalThr);
-            if (result.size() == 0) {
+            if (result.empty()) {
                 Debug(Debug::WARNING) << "Can not map any domains for entry " << id << "!\n";
                 continue;
             }
diff --git a/src/util/swapresults.cpp b/src/util/swapresults.cpp
index ef6bfae..5d48c3e 100644
--- a/src/util/swapresults.cpp
+++ b/src/util/swapresults.cpp
@@ -38,6 +38,8 @@ int doswap(Parameters& par, bool isGeneralMode) {
     std::string parOutDbStr(parOutDb);
     std::string parOutDbIndexStr(parOutDbIndex);
 
+    BaseMatrix *subMat = NULL;
+    EvalueComputation *evaluer = NULL;
     size_t aaResSize = 0;
     unsigned int maxTargetId = 0;
     char *targetElementExists = NULL;
@@ -82,9 +84,19 @@ int doswap(Parameters& par, bool isGeneralMode) {
             unsigned int key = target.sequenceReader->getDbKey(i);
             targetElementExists[key] = 1;
         }
+        int gapOpen, gapExtend;
+        if (Parameters::isEqualDbtype(target.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) {
+            subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, 0.0);
+            gapOpen = par.gapOpen.nucleotides;
+            gapExtend = par.gapExtend.nucleotides;
+        } else {
+            // keep score bias at 0.0 (improved ROC)
+            subMat = new SubstitutionMatrix(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
+            gapOpen = par.gapOpen.aminoacids;
+            gapExtend = par.gapExtend.aminoacids;
+        }
+        evaluer = new EvalueComputation(aaResSize, subMat, gapOpen, gapExtend);
     }
-    SubstitutionMatrix subMat(par.scoringMatrixFile.aminoacids, 2.0, 0.0);
-    EvalueComputation evaluer(aaResSize, &subMat, par.gapOpen, par.gapExtend);
 
     DBReader<unsigned int> resultDbr(parResultDb, parResultDbIndex, par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
     resultDbr.open(DBReader<unsigned int>::SORT_BY_OFFSET);
@@ -261,7 +273,7 @@ int doswap(Parameters& par, bool isGeneralMode) {
                 while (dataSize > 0) {
                     if (isAlignmentResult) {
                         Matcher::result_t res = Matcher::parseAlignmentRecord(data, true);
-                        Matcher::result_t::swapResult(res, evaluer, hasBacktrace);
+                        Matcher::result_t::swapResult(res, *evaluer, hasBacktrace);
                         if (res.eval > par.evalThr) {
                             evalBreak = true;
                             goto outer;
@@ -300,7 +312,7 @@ int doswap(Parameters& par, bool isGeneralMode) {
                     }
 
                     resultWriter.writeData(ss.c_str(), ss.size(), i, thread_idx);
-                    ss = "";
+                    ss.clear();
 
                     curRes.clear();
                 } else if (evalBreak == true || targetElementExists[i] == 1) {
@@ -323,6 +335,14 @@ int doswap(Parameters& par, bool isGeneralMode) {
         DBWriter::mergeResults(parOutDbStr, parOutDbIndexStr, splitFileNames);
     }
 
+    if (evaluer != NULL) {
+        delete evaluer;
+    }
+
+    if (subMat != NULL) {
+        delete subMat;
+    }
+
     resultDbr.close();
     if (targetElementExists != NULL) {
         delete[] targetElementExists;
diff --git a/src/util/tar2db.cpp b/src/util/tar2db.cpp
new file mode 100644
index 0000000..592c046
--- /dev/null
+++ b/src/util/tar2db.cpp
@@ -0,0 +1,244 @@
+#include "FileUtil.h"
+#include "DBWriter.h"
+#include "Debug.h"
+#include "Util.h"
+#include "PatternCompiler.h"
+
+#include "microtar.h"
+
+#ifdef HAVE_ZLIB
+#include <zlib.h>
+static int file_gzwrite(mtar_t *tar, const void *data, size_t size) {
+    const size_t written = gzwrite(static_cast<gzFile>(tar->stream), data, size); // byte count reported by zlib
+    return (written != size) ? MTAR_EWRITEFAIL : MTAR_ESUCCESS; // anything but a full write is a failure
+}
+
+static int file_gzread(mtar_t *tar, void *data, size_t size) {
+    const size_t got = gzread(static_cast<gzFile>(tar->stream), data, size); // gzread's int result, converted to size_t
+    return (got != size) ? MTAR_EREADFAIL : MTAR_ESUCCESS; // anything but a full read is a failure
+}
+
+static int file_gzseek(mtar_t *tar, long offset) {
+    const z_off_t pos = gzseek(static_cast<gzFile>(tar->stream), offset, SEEK_SET); // keep z_off_t, do not narrow to int
+    return (pos != -1) ? MTAR_ESUCCESS : MTAR_ESEEKFAIL; // gzseek signals failure with -1
+}
+
+static int file_gzclose(mtar_t *tar) {
+    gzclose(static_cast<gzFile>(tar->stream)); // the status returned by gzclose is not inspected
+    return MTAR_ESUCCESS; // success is reported unconditionally
+}
+
+int mtar_gzopen(mtar_t *tar, const char *filename, const char *mode) {
+    // Start from a zeroed struct and install the gzip-backed I/O callbacks
+    memset(tar, 0, sizeof(*tar));
+    tar->close = file_gzclose;
+    tar->seek = file_gzseek;
+    tar->read = file_gzread;
+    tar->write = file_gzwrite;
+
+    // Force the binary variant of the mode; when several letters occur the last check wins
+    if (strchr(mode, 'r') != NULL) { mode = "rb"; }
+    if (strchr(mode, 'w') != NULL) { mode = "wb"; }
+    if (strchr(mode, 'a') != NULL) { mode = "ab"; }
+
+    // Hand the file to zlib
+    tar->stream = gzopen(filename, mode);
+    if (tar->stream == NULL) {
+        return MTAR_EOPENFAIL;
+    }
+
+    // In read mode, read the first header right away so an invalid archive is rejected here
+    if (mode[0] == 'r') {
+        mtar_header_t firstHeader;
+        const int status = mtar_read_header(tar, &firstHeader);
+        if (status != MTAR_ESUCCESS) {
+            mtar_close(tar);
+            return status;
+        }
+    }
+
+    // Archive is open and usable
+    return MTAR_ESUCCESS;
+}
+#endif
+
+#ifdef HAVE_BZLIB
+#include <bzlib.h>
+#endif
+
+int tar2db(int argc, const char **argv, const Command& command) {
+    // Stores each regular-file entry of the input tar archives (plain, .tar.gz or .tgz) as one database entry; entries named *.gz or *.bz2 are decompressed first.
+    Parameters &par = Parameters::getInstance();
+    par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
+
+    std::vector<std::string> filenames(par.filenames);
+    for (size_t i = 0; i < filenames.size(); i++) {
+        if (FileUtil::directoryExists(filenames[i].c_str()) == true) {
+            Debug(Debug::ERROR) << "File " << filenames[i] << " is a directory.\n";
+            EXIT(EXIT_FAILURE);
+        }
+    }
+
+    PatternCompiler include(par.tarInclude.c_str());
+    PatternCompiler exclude(par.tarExclude.c_str());
+    // The last file name is the output database, all preceding ones are input archives
+    std::string dataFile = filenames.back();
+    filenames.pop_back();
+    std::string indexFile = dataFile + ".index";
+    // .source receives one "<archive index>\t<archive base name>" line per input archive
+    std::string sourceFile = dataFile + ".source";
+    FILE *source = FileUtil::openAndDelete(sourceFile.c_str(), "w");
+    // .lookup receives one "<key>\t<entry base name>\t<archive index>" line per stored entry
+    std::string lookupFile = dataFile + ".lookup";
+    FILE *lookup = FileUtil::openAndDelete(lookupFile.c_str(), "w");
+
+    DBWriter writer(dataFile.c_str(), indexFile.c_str(), 1, par.compressed, par.outputDbType);
+    writer.open();
+    Debug::Progress progress;
+    char buffer[4096];
+
+#ifdef HAVE_ZLIB
+    const unsigned int CHUNK = 128 * 1024;
+    unsigned char out[CHUNK];
+    z_stream strm;
+    memset(&strm, 0, sizeof(z_stream));
+    strm.zalloc = Z_NULL;
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
+    strm.next_in = Z_NULL; // no input yet; set per entry before inflate() is called
+    strm.avail_in = 0;
+    if (inflateInit2(&strm, 15 | 32) != Z_OK) { // 15 | 32: accept zlib or gzip headers (auto-detected)
+        Debug(Debug::ERROR) << "Cannot initialize zlib stream\n";
+        EXIT(EXIT_FAILURE);
+    }
+#endif
+    // Keys run consecutively across all archives; entries rejected by the include/exclude filters still consume a key
+    size_t key = 0;
+    for (size_t i = 0; i < filenames.size(); i++) {
+        size_t len = snprintf(buffer, sizeof(buffer), "%zu\t%s\n", i, FileUtil::baseName(filenames[i]).c_str());
+        int written = fwrite(buffer, sizeof(char), len, source);
+        if (written != (int) len) {
+            Debug(Debug::ERROR) << "Cannot write to source file " << sourceFile << "\n";
+            EXIT(EXIT_FAILURE);
+        }
+
+        mtar_t tar;
+        if (Util::endsWith(".tar.gz", filenames[i]) || Util::endsWith(".tgz", filenames[i])) {
+#ifdef HAVE_ZLIB
+            if (mtar_gzopen(&tar, filenames[i].c_str(), "r") != MTAR_ESUCCESS) {
+                Debug(Debug::ERROR) << "Cannot open file " << filenames[i] << "\n";
+                EXIT(EXIT_FAILURE);
+            }
+#else
+            Debug(Debug::ERROR) << "MMseqs2 was not compiled with zlib support. Cannot read compressed input.\n";
+            EXIT(EXIT_FAILURE);
+#endif
+        } else {
+            if (mtar_open(&tar, filenames[i].c_str(), "r") != MTAR_ESUCCESS) {
+                Debug(Debug::ERROR) << "Cannot open file " << filenames[i] << "\n";
+                EXIT(EXIT_FAILURE);
+            }
+        }
+
+        size_t bufferSize = 10 * 1024;
+        char* dataBuffer = (char*) malloc(bufferSize);
+        size_t inflateSize = 10 * 1024;
+        char* inflateBuffer = (char*) malloc(inflateSize);
+
+        mtar_header_t header;
+        int headerStatus;
+        while ((headerStatus = mtar_read_header(&tar, &header)) == MTAR_ESUCCESS) { // stop on any non-success status; it is checked after the loop
+            if (header.type != MTAR_TREG) {
+                mtar_next(&tar);
+                continue;
+            }
+            progress.updateProgress();
+            if (include.isMatch(header.name) == false || exclude.isMatch(header.name) == true) {
+                key++;
+                mtar_next(&tar);
+                continue;
+            }
+            if (header.size > bufferSize) {
+                bufferSize = header.size * 1.5;
+                dataBuffer = (char*)realloc(dataBuffer, bufferSize);
+            }
+            if (mtar_read_data(&tar, dataBuffer, header.size) != MTAR_ESUCCESS) {
+                Debug(Debug::ERROR) << "Cannot read entry " << header.name << "\n";
+                EXIT(EXIT_FAILURE);
+            }
+
+            if (Util::endsWith(".gz", header.name)) {
+#ifdef HAVE_ZLIB
+                inflateReset(&strm);
+                writer.writeStart(0);
+                strm.avail_in = header.size;
+                strm.next_in = (unsigned char*)dataBuffer;
+                do {
+                    strm.avail_out = CHUNK;
+                    strm.next_out = out;
+                    int err = inflate(&strm, Z_NO_FLUSH);
+                    switch (err) {
+                        case Z_OK:
+                        case Z_STREAM_END:
+                        case Z_BUF_ERROR:
+                            break;
+                        default:
+                            inflateEnd(&strm);
+                            Debug(Debug::ERROR) << "Gzip error " << err << " entry " << header.name << "\n";
+                            EXIT(EXIT_FAILURE);
+                    }
+                    const unsigned have = CHUNK - strm.avail_out; // bytes produced by this inflate() call
+                    writer.writeAdd((const char*)out, have, 0);
+                } while (strm.avail_out == 0); // a completely filled output chunk means more output may be pending
+                writer.writeEnd(key, 0);
+#else
+                Debug(Debug::ERROR) << "MMseqs2 was not compiled with zlib support. Cannot read compressed input.\n";
+                EXIT(EXIT_FAILURE);
+#endif
+            } else if (Util::endsWith(".bz2", header.name)) {
+#ifdef HAVE_BZLIB
+                unsigned int entrySize = inflateSize;
+                int err;
+                while ((err = BZ2_bzBuffToBuffDecompress(inflateBuffer, &entrySize, dataBuffer, header.size, 0, 0)) == BZ_OUTBUFF_FULL) { // assign first, then compare
+                    entrySize = inflateSize = inflateSize * 1.5;
+                    inflateBuffer = (char*)realloc(inflateBuffer, inflateSize);
+                }
+                if (err != BZ_OK) {
+                    Debug(Debug::ERROR) << "Could not decompress " << header.name  << "\n";
+                    EXIT(EXIT_FAILURE);
+                }
+                writer.writeData(inflateBuffer, entrySize, key, 0);
+#else
+                Debug(Debug::ERROR) << "MMseqs2 was not compiled with bzlib support. Cannot read compressed input.\n";
+                EXIT(EXIT_FAILURE);
+#endif
+            } else {
+                writer.writeData(dataBuffer, header.size, key, 0);
+            }
+            size_t len = snprintf(buffer, sizeof(buffer), "%zu\t%s\t%zu\n", key, FileUtil::baseName(header.name).c_str(), i);
+            int written = fwrite(buffer, sizeof(char), len, lookup);
+            if (written != (int) len) {
+                Debug(Debug::ERROR) << "Cannot write to lookup file " << lookupFile << "\n";
+                EXIT(EXIT_FAILURE);
+            }
+            key++;
+            mtar_next(&tar);
+        }
+        if (headerStatus != MTAR_ENULLRECORD) { // only the null record marks a regular end of the archive
+            Debug(Debug::ERROR) << "Cannot read tar header in " << filenames[i] << " (error " << headerStatus << ")\n";
+            EXIT(EXIT_FAILURE);
+        }
+
+        free(inflateBuffer);
+        free(dataBuffer);
+        mtar_close(&tar);
+    }
+    fclose(lookup);
+    fclose(source);
+    writer.close();
+#ifdef HAVE_ZLIB
+    inflateEnd(&strm);
+#endif
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/util/transitivealign.cpp b/src/util/transitivealign.cpp
index 2ae62d1..94bae80 100644
--- a/src/util/transitivealign.cpp
+++ b/src/util/transitivealign.cpp
@@ -121,7 +121,7 @@ int transitivealign(int argc, const char **argv, const Command &command) {
     DBWriter resultWriter(tmpRes.c_str(), tmpResIndex.c_str(), par.threads, par.compressed, Parameters::DBTYPE_ALIGNMENT_RES);
     resultWriter.open();
 
-    EvalueComputation evaluer(sequenceDbr.getAminoAcidDBSize(), subMat, par.gapOpen, par.gapExtend);
+    EvalueComputation evaluer(sequenceDbr.getAminoAcidDBSize(), subMat, par.gapOpen.aminoacids, par.gapExtend.aminoacids);
     const size_t flushSize = 100000000;
     size_t iterations = static_cast<int>(ceil(static_cast<double>(alnReader.getSize()) / static_cast<double>(flushSize)));
     for (size_t i = 0; i < iterations; i++) {
@@ -135,7 +135,7 @@ int transitivealign(int argc, const char **argv, const Command &command) {
             thread_idx = (unsigned int) omp_get_thread_num();
 #endif
 
-            Matcher matcher(querySeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.gapOpen, par.gapExtend);
+            Matcher matcher(querySeqType, par.maxSeqLen, subMat, &evaluer, par.compBiasCorrection, par.gapOpen.aminoacids, par.gapExtend.aminoacids, par.zdrop);
 
 //            Sequence query(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, par.spacedKmer, par.compBiasCorrection);
 //            Sequence target(par.maxSeqLen, targetSeqType, subMat, par.kmerSize, par.spacedKmer, par.compBiasCorrection);
@@ -208,7 +208,7 @@ int transitivealign(int argc, const char **argv, const Command &command) {
 //                            result.backtrace.push_back('M');
                         }else{
                             btTranslate.translateResult(swappedResult, results[entryIdx_j], result);
-                            updateResultByRescoringBacktrace(querySeq, targetSeq, fastMatrix.matrix, evaluer, par.gapOpen, par.gapExtend, result);
+                            updateResultByRescoringBacktrace(querySeq, targetSeq, fastMatrix.matrix, evaluer, par.gapOpen.aminoacids, par.gapExtend.aminoacids, result);
                         }
                         // checkCriteria and Util::canBeCovered always work together
                         if (Alignment::checkCriteria(result, isIdentity, par.evalThr, par.seqIdThr, par.alnLenThr, par.covMode, par.covThr)) {
diff --git a/src/util/translateaa.cpp b/src/util/translateaa.cpp
index 3281b63..0db8d62 100644
--- a/src/util/translateaa.cpp
+++ b/src/util/translateaa.cpp
@@ -37,7 +37,7 @@ int translateaa(int argc, const char **argv, const Command &command) {
                     data[1] = nucLookup[nuc2];
                     data[2] = nucLookup[nuc3];
                     translateNucl.translate(writeAA, data, 3);
-                    if (writeAA[0] == subMat.int2aa[i]) {
+                    if (writeAA[0] == subMat.num2aa[i]) {
                         lookupAA[i][0] = data[0];
                         lookupAA[i][1] = data[1];
                         lookupAA[i][2] = data[2];
@@ -73,7 +73,7 @@ int translateaa(int argc, const char **argv, const Command &command) {
 
             // ignore null char at the end
             for (int pos = 0; pos < aaSequence.L; ++pos) {
-                nucSeq.append(lookupAA[aaSequence.int_sequence[pos]], 3);
+                nucSeq.append(lookupAA[aaSequence.numSequence[pos]], 3);
             }
 
             nucSeq.append(1, '\n');
diff --git a/src/util/translatenucs.cpp b/src/util/translatenucs.cpp
index 79828a8..5b419cc 100644
--- a/src/util/translatenucs.cpp
+++ b/src/util/translatenucs.cpp
@@ -65,7 +65,7 @@ int translatenucs(int argc, const char **argv, const Command& command) {
             // needs to be int in order to be able to check
             size_t length = reader.getEntryLen(i) - 1;
             if ((data[length] != '\n' && length % 3 != 0) && (data[length - 1] == '\n' && (length - 1) % 3 != 0)) {
-                Debug(Debug::WARNING) << "Nucleotide sequence entry " << key << " length (" << length << ") is not divisible by three. Adjust length to (lenght=" <<  length - (length % 3) << ").\n";
+                Debug(Debug::WARNING) << "Nucleotide sequence entry " << key << " length (" << length << ") is not divisible by three. Adjust length to (length=" <<  length - (length % 3) << ").\n";
                 length = length - (length % 3);
             }
 
diff --git a/src/util/view.cpp b/src/util/view.cpp
index 2525d9b..e4cfbc7 100644
--- a/src/util/view.cpp
+++ b/src/util/view.cpp
@@ -1,24 +1,14 @@
-//
-// Created by Martin Steinegger on 2019-01-17.
-//
-
 #include "Parameters.h"
-#include "FileUtil.h"
-#include "DBReader.h"
-#include "DBWriter.h"
+#include "IndexReader.h"
 #include "Debug.h"
 #include "Util.h"
 
-#include <climits>
-#include <IndexReader.h>
-
 int view(int argc, const char **argv, const Command& command) {
     Parameters& par = Parameters::getInstance();
-    par.verbosity = 1;
-    par.parseParameters(argc, argv, command, true, 0, 0);
+    par.parseParameters(argc, argv, command, false, 0, 0);
     std::vector<std::string> ids = Util::split(par.idList, ",");
     int indexSrcType = IndexReader::SEQUENCES;
-    switch(par.idxEntryType){
+    switch (par.idxEntryType) {
         case 0:
             indexSrcType = IndexReader::SEQUENCES;
             break;
@@ -33,19 +23,16 @@ int view(int argc, const char **argv, const Command& command) {
             break;
     }
     IndexReader reader(par.db1, par.threads, indexSrcType, 0);
-    char dbKey[256];
-    for (size_t i = 0; i< ids.size(); i++) {
-        strncpy(dbKey, ids[i].c_str(), ids[i].size());
-        dbKey[ids[i].size()]='\0';
-        const unsigned int key = Util::fast_atoi<unsigned int>(dbKey);
+    for (size_t i = 0; i < ids.size(); ++i) {
+        const unsigned int key = Util::fast_atoi<unsigned int>(ids[i].c_str());
         const size_t id = reader.sequenceReader->getId(key);
         if (id >= UINT_MAX) {
-            Debug(Debug::WARNING) << "Key " << ids[i] << " not found in database\n";
+            Debug(Debug::ERROR) << "Key " << ids[i] << " not found in database\n";
             continue;
         }
         char* data = reader.sequenceReader->getData(id, 0);
-        std::cout << data;
+        size_t size = reader.sequenceReader->getEntryLen(id) - 1;
+        fwrite(data, sizeof(char), size, stdout);
     }
     EXIT(EXIT_SUCCESS);
-    return EXIT_SUCCESS;
 }
diff --git a/src/version/CMakeLists.txt b/src/version/CMakeLists.txt
index a0119e0..ad3540f 100644
--- a/src/version/CMakeLists.txt
+++ b/src/version/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_library(version Version.cpp)
+set_target_properties(version PROPERTIES COMPILE_FLAGS "${MMSEQS_CXX_FLAGS}" LINK_FLAGS "${MMSEQS_CXX_FLAGS}")
 
 if (VERSION_OVERRIDE)
     target_compile_definitions(version PRIVATE -DGIT_SHA1=${VERSION_OVERRIDE})
diff --git a/src/workflow/CMakeLists.txt b/src/workflow/CMakeLists.txt
index 63b5d76..ce266e9 100644
--- a/src/workflow/CMakeLists.txt
+++ b/src/workflow/CMakeLists.txt
@@ -1,8 +1,10 @@
 set(workflow_source_files
         workflow/Cluster.cpp
         workflow/ClusterUpdate.cpp
+        workflow/Databases.cpp
         workflow/Linclust.cpp
         workflow/EasySearch.cpp
+        workflow/EasyRbh.cpp
         workflow/EasyCluster.cpp
         workflow/EasyLinclust.cpp
         workflow/Enrich.cpp
diff --git a/src/workflow/Cluster.cpp b/src/workflow/Cluster.cpp
index ea63923..bc17d82 100644
--- a/src/workflow/Cluster.cpp
+++ b/src/workflow/Cluster.cpp
@@ -6,6 +6,7 @@
 #include "FileUtil.h"
 
 #include "cascaded_clustering.sh.h"
+#include "nucleotide_clustering.sh.h"
 #include "clustering.sh.h"
 
 #include <cassert>
@@ -38,41 +39,53 @@ int setAutomaticIterations(float sens){
     }
 }
 
+
+void setNuclClusterDefaults(Parameters *p) {
+    // Ungapped mode is kept, exact k-mer matching is always enabled, the other values only fill in parameters that were not set
+    const bool ungapped = (p->alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED);
+    if (!ungapped) {
+        p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
+    }
+    p->exactKmerMatching = true;
+    if (!p->PARAM_K.wasSet) {
+        p->kmerSize = 15;
+    }
+    if (!p->PARAM_STRAND.wasSet) {
+        p->strand = 2;
+    }
+    if (!p->PARAM_DIAGONAL_SCORING.wasSet) {
+        p->diagonalScoring = 0;
+    }
+    if (!p->PARAM_MAX_SEQ_LEN.wasSet) {
+        p->maxSeqLen = 10000;
+    }
+}
+
+
+
 int clusteringworkflow(int argc, const char **argv, const Command& command) {
     Parameters& par = Parameters::getInstance();
     setWorkflowDefaults(&par);
-    par.overrideParameterDescription((Command &)command, par.PARAM_RESCORE_MODE.uniqid, NULL, NULL, par.PARAM_RESCORE_MODE.category |MMseqsParameter::COMMAND_EXPERT );
-    par.overrideParameterDescription((Command &)command, par.PARAM_MAX_REJECTED.uniqid, NULL, NULL, par.PARAM_MAX_REJECTED.category |MMseqsParameter::COMMAND_EXPERT );
-    par.overrideParameterDescription((Command &)command, par.PARAM_MAX_ACCEPT.uniqid, NULL, NULL, par.PARAM_MAX_ACCEPT.category |MMseqsParameter::COMMAND_EXPERT );
-    par.overrideParameterDescription((Command &)command, par.PARAM_KMER_PER_SEQ.uniqid, NULL, NULL, par.PARAM_KMER_PER_SEQ.category |MMseqsParameter::COMMAND_EXPERT );
-    par.overrideParameterDescription((Command &)command, par.PARAM_S.uniqid, "sensitivity will be automatically determined but can be adjusted", NULL,  par.PARAM_S.category |MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_INCLUDE_ONLY_EXTENDABLE.uniqid, NULL, NULL, par.PARAM_INCLUDE_ONLY_EXTENDABLE.category |MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_KMER_PER_SEQ.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT);
 
     par.parseParameters(argc, argv, command, true, 0, 0);
-
-    bool sensitivitySet = false;
-    bool compositionBiasSet = false;
-    bool clusterModeSet = false;
-    bool clusterStepsSet = false;
-    bool minDiagonalScoreSet = false;
-
-    for (size_t i = 0; i < par.clusterworkflow.size(); i++) {
-        if (par.clusterworkflow[i]->uniqid == par.PARAM_S.uniqid && par.clusterworkflow[i]->wasSet) {
-            sensitivitySet = true;
-        }
-        if (par.clusterworkflow[i]->uniqid == par.PARAM_CLUSTER_MODE.uniqid && par.clusterworkflow[i]->wasSet) {
-            clusterModeSet = true;
-        }
-        if (par.clusterworkflow[i]->uniqid == par.PARAM_CLUSTER_STEPS.uniqid && par.clusterworkflow[i]->wasSet) {
-            clusterStepsSet = true;
-        }
-        if (par.clusterworkflow[i]->uniqid == par.PARAM_NO_COMP_BIAS_CORR.uniqid && par.clusterworkflow[i]->wasSet) {
-            compositionBiasSet = true;
-        }
-        if (par.clusterworkflow[i]->uniqid == par.PARAM_MIN_DIAG_SCORE.uniqid && par.clusterworkflow[i]->wasSet) {
-            minDiagonalScoreSet = true;
-        }
+    const int dbType = FileUtil::parseDbType(par.db1.c_str());
+    bool isNucleotideDb = (Parameters::isEqualDbtype(dbType, Parameters::DBTYPE_NUCLEOTIDES));
+    if(isNucleotideDb){
+        setNuclClusterDefaults(&par);
     }
+    bool sensitivitySet = par.PARAM_S.wasSet;
+    bool compositionBiasSet = par.PARAM_NO_COMP_BIAS_CORR.wasSet;
+    bool clusterModeSet = par.PARAM_CLUSTER_MODE.wasSet;
+    bool clusterStepsSet = par.PARAM_CLUSTER_STEPS.wasSet;
+    bool minDiagonalScoreSet = par.PARAM_MIN_DIAG_SCORE.wasSet;
 
     if (compositionBiasSet == false) {
         if(par.seqIdThr >= 0.7){
@@ -86,12 +99,11 @@ int clusteringworkflow(int argc, const char **argv, const Command& command) {
         }
     }
 
-    if (sensitivitySet == false) {
+    if (sensitivitySet == false && isNucleotideDb == false) {
         par.sensitivity = setAutomaticThreshold(par.seqIdThr);
         Debug(Debug::INFO) << "Set cluster sensitivity to -s " << par.sensitivity << "\n";
     }
 
-    const int dbType = FileUtil::parseDbType(par.db1.c_str());
     const bool isUngappedMode = par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED;
     if (isUngappedMode && Parameters::isEqualDbtype(dbType, Parameters::DBTYPE_HMM_PROFILE)) {
         par.printUsageMessage(command, MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_PREFILTER);
@@ -99,6 +111,7 @@ int clusteringworkflow(int argc, const char **argv, const Command& command) {
         EXIT(EXIT_FAILURE);
     }
 
+
     const bool nonSymetric = (par.covMode == Parameters::COV_MODE_TARGET ||par.covMode == Parameters::COV_MODE_QUERY);
     if (clusterModeSet == false) {
         if (nonSymetric) {
@@ -114,7 +127,7 @@ int clusteringworkflow(int argc, const char **argv, const Command& command) {
                               << " in combination with coverage mode " << par.covMode << " can produce wrong results.\n"
                               << "Please use --cov-mode 2\n";
     }
-    if (par.cascaded == true && par.clusteringMode == Parameters::CONNECTED_COMPONENT) {
+    if (par.singleStepClustering == false && par.clusteringMode == Parameters::CONNECTED_COMPONENT) {
         Debug(Debug::WARNING) << "Connected component clustering produces less clusters in a single step clustering.\n"
                               << "Please use --single-step-cluster";
     }
@@ -141,12 +154,43 @@ int clusteringworkflow(int argc, const char **argv, const Command& command) {
     par.rescoreMode = originalRescoreMode;
     cmd.addVariable("RUNNER", par.runner.c_str());
     cmd.addVariable("MERGECLU_PAR", par.createParameterString(par.threadsandcompression).c_str());
+    cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
+
+    if(isNucleotideDb){
+        par.forwardFrames= "1";
+        par.reverseFrames= "1";
+        par.searchType = 3;
+        cmd.addVariable("EXTRACT_FRAMES_PAR", par.createParameterString(par.extractframes).c_str());
+        int oldKmer = par.kmerSize;
+        par.kmerSize = 0;
+        cmd.addVariable("LINCLUST_PAR", par.createParameterString(par.linclustworkflow).c_str());
+        par.kmerSize = oldKmer;
+        if (par.PARAM_MAX_SEQS.wasSet == false) {
+            par.maxResListLen = 300;
+        }
 
-    if (par.cascaded) {
+        cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str());
+        if (isUngappedMode) {
+            par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
+            cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.rescorediagonal).c_str());
+            par.rescoreMode = originalRescoreMode;
+        } else {
+            cmd.addVariable("ALIGNMENT_MODE_NOT_SET","TRUE");
+            par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT;
+            cmd.addVariable("RESCORE_ALN_PAR", par.createParameterString(par.rescorediagonal).c_str());
+            cmd.addVariable("THREADSANDCOMPRESS_PAR", par.createParameterString(par.threadsandcompression).c_str());
+            cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.align).c_str());
+        }
+        cmd.addVariable("CLUSTER_PAR",   par.createParameterString(par.clust).c_str());
+        cmd.addVariable("OFFSETALIGNMENT_PAR", par.createParameterString(par.offsetalignment).c_str());
+        std::string program = tmpDir + "/nucleotide_clustering.sh";
+        FileUtil::writeFile(program, nucleotide_clustering_sh, nucleotide_clustering_sh_len);
+        cmd.execProgram(program.c_str(), par.filenames);
+    } else if (par.singleStepClustering == false) {
         // save some values to restore them later
         float targetSensitivity = par.sensitivity;
-        int alphabetSize = par.alphabetSize;
-        par.alphabetSize = Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE;
+        MultiParam<int> alphabetSize = par.alphabetSize;
+        par.alphabetSize = MultiParam<int>(Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE, 5);
         int kmerSize = par.kmerSize;
         par.kmerSize = Parameters::CLUST_LINEAR_DEFAULT_K;
         int maskMode = par.maskMode;
@@ -194,8 +238,6 @@ int clusteringworkflow(int argc, const char **argv, const Command& command) {
         }
         cmd.addVariable("THREADSANDCOMPRESS", par.createParameterString(par.threadsandcompression).c_str());
         cmd.addVariable("VERBCOMPRESS", par.createParameterString(par.verbandcompression).c_str());
-        cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
-
         cmd.addVariable("ALIGNMENT_REASSIGN_PAR", par.createParameterString(par.align).c_str());
 
         std::string program = tmpDir + "/cascaded_clustering.sh";
@@ -203,11 +245,13 @@ int clusteringworkflow(int argc, const char **argv, const Command& command) {
         cmd.execProgram(program.c_str(), par.filenames);
     } else {
         // same as above, clusthash needs a smaller alphabetsize
-        size_t alphabetSize = par.alphabetSize;
-        par.alphabetSize = Parameters::CLUST_HASH_DEFAULT_ALPH_SIZE;
+        MultiParam<int> alphabetSize = par.alphabetSize;
+        par.alphabetSize = MultiParam<int> (Parameters::CLUST_HASH_DEFAULT_ALPH_SIZE, 5);
+        float seqIdThr = par.seqIdThr;
+        par.seqIdThr = (float)Parameters::CLUST_HASH_DEFAULT_MIN_SEQ_ID/100.0f;
         cmd.addVariable("DETECTREDUNDANCY_PAR", par.createParameterString(par.clusthash).c_str());
         par.alphabetSize = alphabetSize;
-
+        par.seqIdThr = seqIdThr;
         cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str());
         if (isUngappedMode) {
             cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.rescorediagonal).c_str());
diff --git a/src/workflow/ClusterUpdate.cpp b/src/workflow/ClusterUpdate.cpp
index e3be977..ac1308e 100644
--- a/src/workflow/ClusterUpdate.cpp
+++ b/src/workflow/ClusterUpdate.cpp
@@ -10,6 +10,32 @@
 
 int clusterupdate(int argc, const char **argv, const Command& command) {
     Parameters& par = Parameters::getInstance();
+    par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ALT_ALIGNMENT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_KMER_PER_SEQ_SCALE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_KMER_PER_SEQ.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_START_SENS.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_SENS_STEPS.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_CLUSTER_REASSIGN.addCategory(MMseqsParameter::COMMAND_EXPERT);
+
+    par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_NUM_ITERATIONS.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    for (size_t i = 0; i < par.extractorfs.size(); i++) {
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.translatenucs.size(); i++) {
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.result2profile.size(); i++){
+        par.result2profile[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+
     par.parseParameters(argc, argv, command, true, 0, 0);
 
     CommandCaller cmd;
@@ -26,7 +52,7 @@ int clusterupdate(int argc, const char **argv, const Command& command) {
     par.maxAccept = maxAccept;
 
     cmd.addVariable("CLUST_PAR", par.createParameterString(par.clusterworkflow).c_str());
-    
+
     std::string tmpDir = par.db6;
     std::string hash = SSTR(par.hashParameter(par.filenames, par.clusterUpdate));
     if (par.reuseLatest) {
diff --git a/src/workflow/CreateIndex.cpp b/src/workflow/CreateIndex.cpp
index c05dcd7..63a5b01 100644
--- a/src/workflow/CreateIndex.cpp
+++ b/src/workflow/CreateIndex.cpp
@@ -68,8 +68,23 @@ int createlinindex(int argc, const char **argv, const Command& command) {
     par.orfMaxLength = 32734;
     par.kmerScore = 0; // extract all k-mers
     par.maskMode = 0;
+    par.spacedKmer = false;
     // VTML has a slightly lower sensitivity in the regression test
-    par.seedScoringMatrixFile = ScoreMatrixFile("blosum62.out", "nucleotide.out");
+    par.seedScoringMatrixFile = MultiParam<char*>("blosum62.out", "nucleotide.out");
+
+    par.PARAM_COV_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_C.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MIN_SEQ_ID.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    for (size_t i = 0; i < par.extractorfs.size(); i++) {
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.translatenucs.size(); i++) {
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    par.PARAM_COMPRESSED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+
     par.parseParameters(argc, argv, command, true, 0, 0);
     int dbType = FileUtil::parseDbType(par.db1.c_str());
     bool isNucl = Parameters::isEqualDbtype(dbType, Parameters::DBTYPE_NUCLEOTIDES);
@@ -96,16 +111,33 @@ int createindex(int argc, const char **argv, const Command& command) {
     par.kmerScore = 0; // extract all k-mers
     par.sensitivity = 7.5;
     par.maskMode = 1;
+
+    par.PARAM_COV_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_C.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MIN_SEQ_ID.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_SPLIT.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    for (size_t i = 0; i < par.splitsequence.size(); i++) {
+        par.splitsequence[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.extractorfs.size(); i++) {
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.translatenucs.size(); i++) {
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    par.PARAM_COMPRESSED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+
     par.parseParameters(argc, argv, command, true, 0, 0);
 
     int dbType = FileUtil::parseDbType(par.db1.c_str());
     bool isNucl = Parameters::isEqualDbtype(dbType, Parameters::DBTYPE_NUCLEOTIDES);
 
-    if(par.PARAM_STRAND.wasSet == false){
+    if (par.PARAM_STRAND.wasSet == false) {
         par.strand = 1;
     }
-    par.overrideParameterDescription((Command &) command, par.PARAM_MASK_RESIDUES.uniqid, "0: w/o low complexity masking, 1: with low complexity masking, 2: add both masked and unmasked sequences to index", "^[0-2]{1}", par.PARAM_MASK_RESIDUES.category);
-
     if(isNucl && par.searchType == Parameters::SEARCH_TYPE_NUCLEOTIDES ){
         if ( par.PARAM_K.wasSet == false) {
             par.kmerSize = 15;
diff --git a/src/workflow/Databases.cpp b/src/workflow/Databases.cpp
new file mode 100644
index 0000000..52a4d91
--- /dev/null
+++ b/src/workflow/Databases.cpp
@@ -0,0 +1,256 @@
+#include "Util.h"
+#include "Parameters.h"
+#include "Debug.h"
+#include "FileUtil.h"
+#include "CommandCaller.h"
+
+#include <cassert>
+
+#include "databases.sh.h"
+
+struct EnvironmentEntry { // one key/value pair that databases() hands to CommandCaller::addVariable
+    const char* key; // passed as the first argument of addVariable
+    const char* value; // passed as the second argument of addVariable
+};
+
+struct DatabaseDownload { // one entry of the downloads table; members are filled positionally, so their order matters
+    const char *name; // compared against par.db1 to select the entry; first column of the listing
+    const char *description; // printed below the row in the detailed listing when non-empty
+    const char *citation; // printed after "  Cite: " in the detailed listing when non-empty
+    const char *url; // shown in the "Url" column of the listing
+    bool hasTaxonomy; // listed as "yes" or "-"; when true the TAXONOMY variable is set to "TRUE"
+    int dbType; // given to Parameters::getDbTypeName to fill the "Type" column
+    const unsigned char *script; // bytes written to <tmpDir>/download.sh by databases()
+    size_t scriptLength; // length passed along with script to FileUtil::writeFile
+    std::vector<EnvironmentEntry> environment; // extra variables added to the CommandCaller before the script is executed
+};
+
+std::vector<DatabaseDownload> downloads = {{ // table searched by databases(); each entry lists the DatabaseDownload members in declaration order
+   "UniRef100",
+   "The UniProt Reference Clusters provide clustered sets of sequences from the UniProt Knowledgebase.",
+   "Suzek et al: UniRef: comprehensive and non-redundant UniProt reference clusters. Bioinformatics 23(10), 1282–1288 (2007)",
+   "https://www.uniprot.org/help/uniref",
+   true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+   { }
+}, {
+    "UniRef90",
+    "The UniProt Reference Clusters provide clustered sets of sequences from the UniProt Knowledgebase.",
+    "Suzek et al: UniRef: comprehensive and non-redundant UniProt reference clusters. Bioinformatics 23(10), 1282–1288 (2007)",
+    "https://www.uniprot.org/help/uniref",
+    true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+    { }
+}, {
+    "UniRef50",
+    "The UniProt Reference Clusters provide clustered sets of sequences from the UniProt Knowledgebase.",
+    "Suzek et al: UniRef: comprehensive and non-redundant UniProt reference clusters. Bioinformatics 23(10), 1282–1288 (2007)",
+    "https://www.uniprot.org/help/uniref",
+    true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+    { }
+}, {
+    "UniProtKB",
+    "The UniProt Knowledgebase is the central hub for the collection of functional information on proteins, with accurate, consistent and rich annotation.",
+    "The UniProt Consortium: UniProt: a worldwide hub of protein knowledge. Nucleic Acids Res 47(D1), D506-515 (2019)",
+    "https://www.uniprot.org/help/uniprotkb",
+    true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+    { }
+}, {
+    "UniProtKB/TrEMBL",
+    "UniProtKB/TrEMBL (unreviewed) contains protein sequences associated with computationally generated annotation and large-scale functional characterization.",
+    "The UniProt Consortium: UniProt: a worldwide hub of protein knowledge. Nucleic Acids Res 47(D1), D506-515 (2019)",
+    "https://www.uniprot.org/help/uniprotkb",
+    true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+    { }
+}, {
+    "UniProtKB/Swiss-Prot",
+    "UniProtKB/Swiss-Prot (reviewed) is a high quality manually annotated and non-redundant protein sequence database, which brings together experimental results, computed features and scientific conclusions.",
+    "The UniProt Consortium: UniProt: a worldwide hub of protein knowledge. Nucleic Acids Res 47(D1), D506-515 (2019)",
+    "https://uniprot.org",
+    true, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+    { }
+}, {
+    "NR",
+    "Non-redundant protein sequences from GenPept, Swissprot, PIR, PDF, PDB, and NCBI RefSeq.",
+    "NCBI Resource Coordinators: Database resources of the National Center for Biotechnology Information. Nucleic Acids Res 46(D1), D8-D13 (2018)",
+    "https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA",
+    false, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+    { }
+}, {
+    "NT",
+    "Partially non-redundant nucleotide sequences from all traditional divisions of GenBank, EMBL, and DDBJ excluding GSS, STS, PAT, EST, HTG, and WGS.",
+    "NCBI Resource Coordinators: Database resources of the National Center for Biotechnology Information. Nucleic Acids Res 46(D1), D8-D13 (2018)",
+    "https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA",
+    false, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len,
+    { }
+}, {
+    "PDB",
+    "The Protein Data Bank is the single worldwide archive of structural data of biological macromolecules.",
+    "Berman et al: The Protein Data Bank. Nucleic Acids Res 28(1), 235-242 (2000)",
+    "https://www.rcsb.org",
+    false, Parameters::DBTYPE_AMINO_ACIDS, databases_sh, databases_sh_len,
+    { }
+}, {
+    "PDB70",
+    "PDB clustered to 70% sequence identity and enriched using HHblits with Uniclust sequences.",
+    "Steinegger et al: HH-suite3 for fast remote homology detection and deep protein annotation. BMC Bioinform 20(1), 473 (2019)",
+    "https://github.com/soedinglab/hh-suite",
+    false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+    { }
+}, {
+    "Pfam-A.full",
+    "The Pfam database is a large collection of protein families, each represented by multiple sequence alignments and hidden Markov models.",
+    "El-Gebali and Mistry et al: The Pfam protein families database in 2019. Nucleic Acids Res 47(D1), D427-D432 (2019)",
+    "https://pfam.xfam.org",
+    false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+    { }
+}, {
+    "Pfam-A.seed",
+    "The Pfam database is a large collection of protein families, each represented by multiple sequence alignments and hidden Markov models.",
+    "El-Gebali and Mistry et al: The Pfam protein families database in 2019. Nucleic Acids Res 47(D1), D427-D432 (2019)",
+    "https://pfam.xfam.org",
+    false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+    { }
+}, {
+    "eggNOG",
+    "eggNOG is a hierarchical, functionally and phylogenetically annotated orthology resource",
+    "Huerta-Cepas et al: eggNOG 5.0: a hierarchical, functionally and phylogenetically annotated orthology resource based on 5090 organisms and 2502 viruses. Nucleic Acids Res 47(D1), D309–D314 (2019)",
+    "http://eggnog5.embl.de",
+    false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+    { }
+}, {
+    "Resfinder",
+    "ResFinder is a database that captures antimicrobial resistance genes from whole-genome data sets.",
+    "Zankari et al: Identification of acquired antimicrobial resistance genes. J Antimicrob Chemother 67(11), 2640-2644 (2012)",
+    "https://cge.cbs.dtu.dk/services/ResFinder",
+    false, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len,
+    { }
+}, {
+    "Kalamari",
+    "Kalamari contains over 250 genomes chosen to be representative of agents tracked by genome-based foodborne disease surveillance, common contaminants, and diverse phyla and bacterial genera.",
+    "Katz et al: Kraken with Kalamari: Contamination Detection. ASM Poster, 270 (2018)",
+    "https://github.com/lskatz/Kalamari",
+    true, Parameters::DBTYPE_NUCLEOTIDES, databases_sh, databases_sh_len,
+    { }
+},
+};
+
+const int PAD_LEFT = 0;  // value first, fill characters after it
+const int PAD_RIGHT = 1; // fill characters first, value after them
+// Appends value to dst in a field of n characters, filling the rest with padding.
+// A value longer than n is appended unpadded; a direction other than the two above adds no fill.
+void appendPadded(std::string& dst, const std::string& value, size_t n, int direction = PAD_LEFT, char padding = ' ') {
+    // number of fill characters, zero when the value already occupies the whole field
+    const size_t fill = (value.size() < n) ? n - value.size() : 0;
+    if (direction == PAD_RIGHT) {
+        dst.append(fill, padding);
+    }
+    dst.append(value);
+    if (direction == PAD_LEFT) {
+        dst.append(fill, padding);
+    }
+}
+
+// Renders the downloads table as text with the tab-separated columns Name, Type, Taxonomy and Url.
+// When detailed is set, the command author comes first and rows are followed by description and citation.
+std::string listDatabases(const Command &command, bool detailed) {
+    // each column starts at the width of its header and grows to fit its longest cell
+    size_t nameWidth = 4, urlWidth = 3, dbTypeWidth = 4;
+    for (size_t i = 0; i < downloads.size(); ++i) {
+        const DatabaseDownload &entry = downloads[i];
+        nameWidth = std::max(nameWidth, strlen(entry.name));
+        urlWidth = std::max(urlWidth, strlen(entry.url));
+        dbTypeWidth = std::max(dbTypeWidth, strlen(Parameters::getDbTypeName(entry.dbType)));
+    }
+
+    std::string listing;
+    listing.reserve(1024);
+    if (detailed) {
+        listing += " By ";
+        listing += command.author;
+        listing += "\n";
+    }
+
+    // header row
+    listing += "\n  ";
+    appendPadded(listing, "Name", nameWidth);
+    listing.push_back('\t');
+    appendPadded(listing, "Type", dbTypeWidth);
+    listing.push_back('\t');
+    appendPadded(listing, "Taxonomy", 8);
+    listing.push_back('\t');
+    appendPadded(listing, "Url", urlWidth);
+    listing.push_back('\n');
+
+    for (size_t i = 0; i < downloads.size(); ++i) {
+        const DatabaseDownload &entry = downloads[i];
+        listing += "- ";
+        appendPadded(listing, entry.name, nameWidth);
+        listing.push_back('\t');
+        appendPadded(listing, Parameters::getDbTypeName(entry.dbType), dbTypeWidth);
+        listing.push_back('\t');
+        appendPadded(listing, (entry.hasTaxonomy ? "yes" : "-"), 8, PAD_RIGHT);
+        listing.push_back('\t');
+        appendPadded(listing, entry.url, urlWidth);
+        listing.push_back('\n');
+        if (detailed && entry.description[0] != '\0') {
+            listing.append("  ").append(entry.description).append("\n");
+        }
+        if (detailed && entry.citation[0] != '\0') {
+            listing.append("  Cite: ").append(entry.citation).append("\n");
+        }
+    }
+
+    return listing;
+}
+
+// Entry point of the databases command: prints the usage text with the database table when
+// par.filenames is empty or help is requested, else runs the script of the entry named par.db1.
+int databases(int argc, const char **argv, const Command &command) {
+    Parameters &par = Parameters::getInstance();
+    par.parseParameters(argc, argv, command, false, Parameters::PARSE_ALLOW_EMPTY, 0);
+
+    const std::string usageText = listDatabases(command, par.help);
+    if (par.filenames.empty() || par.help) {
+        par.printUsageMessage(command, par.help ? MMseqsParameter::COMMAND_EXPERT : 0, usageText.c_str());
+        EXIT(EXIT_SUCCESS);
+    }
+
+    const DatabaseDownload *selected = NULL;
+    for (size_t i = 0; selected == NULL && i < downloads.size(); ++i) {
+        if (par.db1 == downloads[i].name) {
+            selected = &downloads[i];
+        }
+    }
+    if (selected == NULL) {
+        par.printUsageMessage(command, par.help ? MMseqsParameter::COMMAND_EXPERT : 0, usageText.c_str());
+        Debug(Debug::ERROR) << "Selected database " << par.db1 << " was not found\n";
+        EXIT(EXIT_FAILURE);
+    }
+    par.printParameters(command.cmd, argc, argv, par.databases);
+    std::string tmpDir = par.db3;
+    std::string hash = SSTR(par.hashParameter(par.filenames, par.databases));
+    if (par.reuseLatest) {
+        hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
+    }
+    tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);
+    // swap the last element of par.filenames for the directory that was actually created
+    par.filenames.back() = tmpDir;
+    CommandCaller cmd;
+    for (size_t i = 0; i < selected->environment.size(); ++i) {
+        cmd.addVariable(selected->environment[i].key, selected->environment[i].value);
+    }
+    cmd.addVariable("TAXONOMY", selected->hasTaxonomy ? "TRUE" : NULL);
+    cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
+    cmd.addVariable("VERB_PAR", par.createParameterString(par.onlyverbosity).c_str());
+    cmd.addVariable("COMP_PAR", par.createParameterString(par.verbandcompression).c_str());
+    // aria2c gives an (undocumented) error with more than 16 connections
+    cmd.addVariable("ARIA_NUM_CONN", SSTR(std::min(16, par.threads)).c_str());
+    cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str());
+    cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str());
+    std::string program = tmpDir + "/download.sh";
+    FileUtil::writeFile(program, selected->script, selected->scriptLength);
+    cmd.execProgram(program.c_str(), par.filenames);
+
+    // Should never get here
+    assert(false);
+    EXIT(EXIT_FAILURE);
+}
diff --git a/src/workflow/EasyCluster.cpp b/src/workflow/EasyCluster.cpp
index 5883f1d..afb35c1 100644
--- a/src/workflow/EasyCluster.cpp
+++ b/src/workflow/EasyCluster.cpp
@@ -29,21 +29,22 @@ void setEasyClusterMustPassAlong(Parameters *p) {
 
 int easycluster(int argc, const char **argv, const Command &command) {
     Parameters &par = Parameters::getInstance();
-    par.overrideParameterDescription((Command &)command, par.PARAM_ADD_BACKTRACE.uniqid, NULL, NULL, par.PARAM_ADD_BACKTRACE.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_ALT_ALIGNMENT.uniqid, NULL, NULL, par.PARAM_ALT_ALIGNMENT.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_RESCORE_MODE.uniqid, NULL, NULL, par.PARAM_RESCORE_MODE.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_MAX_REJECTED.uniqid, NULL, NULL, par.PARAM_MAX_REJECTED.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_MAX_ACCEPT.uniqid, NULL, NULL, par.PARAM_MAX_ACCEPT.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_KMER_PER_SEQ.uniqid, NULL, NULL, par.PARAM_KMER_PER_SEQ.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_S.uniqid, "Sensitivity will be automatically determined but can be adjusted", NULL, par.PARAM_S.category |MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_INCLUDE_ONLY_EXTENDABLE.uniqid, NULL, NULL, par.PARAM_INCLUDE_ONLY_EXTENDABLE.category | MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_SEQS.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ALT_ALIGNMENT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_KMER_PER_SEQ.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_S.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT);
     for (size_t i = 0; i < par.createdb.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.createdb[i]->uniqid, NULL, NULL, par.createdb[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
-    par.overrideParameterDescription((Command &) command, par.PARAM_THREADS.uniqid, NULL, NULL,
-                                     par.PARAM_THREADS.category & ~MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_V.uniqid, NULL, NULL,
-                                     par.PARAM_V.category & ~MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
 
     setEasyClusterDefaults(&par);
     par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
diff --git a/src/workflow/EasyLinclust.cpp b/src/workflow/EasyLinclust.cpp
index 9571f5c..85dbfc3 100644
--- a/src/workflow/EasyLinclust.cpp
+++ b/src/workflow/EasyLinclust.cpp
@@ -11,7 +11,7 @@ namespace linclust {
 }
 
 void setEasyLinclustDefaults(Parameters *p) {
-    p->spacedKmer = true;
+    p->spacedKmer = false;
     p->removeTmpFiles = true;
     p->covThr = 0.8;
     p->evalThr = 0.001;
@@ -36,18 +36,20 @@ void setEasyLinclustMustPassAlong(Parameters *p) {
 
 int easylinclust(int argc, const char **argv, const Command &command) {
     Parameters &par = Parameters::getInstance();
-    par.overrideParameterDescription((Command &)command, par.PARAM_ADD_BACKTRACE.uniqid, NULL, NULL, par.PARAM_ADD_BACKTRACE.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_ALT_ALIGNMENT.uniqid, NULL, NULL, par.PARAM_ALT_ALIGNMENT.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_RESCORE_MODE.uniqid, NULL, NULL, par.PARAM_RESCORE_MODE.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_MAX_REJECTED.uniqid, NULL, NULL, par.PARAM_MAX_REJECTED.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_MAX_ACCEPT.uniqid, NULL, NULL, par.PARAM_MAX_ACCEPT.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_S.uniqid, "Sensitivity will be automatically determined but can be adjusted", NULL, par.PARAM_S.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &)command, par.PARAM_INCLUDE_ONLY_EXTENDABLE.uniqid, NULL, NULL, par.PARAM_INCLUDE_ONLY_EXTENDABLE.category | MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ALT_ALIGNMENT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.overrideParameterDescription(par.PARAM_S, "Sensitivity will be automatically determined but can be adjusted", NULL, par.PARAM_S.category | MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT);
     for (size_t i = 0; i < par.createdb.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.createdb[i]->uniqid, NULL, NULL, par.createdb[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
-    par.overrideParameterDescription((Command &) command, par.PARAM_THREADS.uniqid, NULL, NULL, par.PARAM_THREADS.category & ~MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_V.uniqid, NULL, NULL, par.PARAM_V.category & ~MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
 
     setEasyLinclustDefaults(&par);
     par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
diff --git a/src/workflow/EasyRbh.cpp b/src/workflow/EasyRbh.cpp
new file mode 100644
index 0000000..663acac
--- /dev/null
+++ b/src/workflow/EasyRbh.cpp
@@ -0,0 +1,106 @@
+#include <cassert>
+#include "LinsearchIndexReader.h"
+#include "PrefilteringIndexReader.h"
+#include "FileUtil.h"
+#include "CommandCaller.h"
+#include "Util.h"
+#include "Debug.h"
+#include "Parameters.h"
+#include "easyrbh.sh.h"
+
+int easyrbh(int argc, const char **argv, const Command &command) { // easy-rbh workflow: fills the script variables and executes easyrbh.sh
+    Parameters &par = Parameters::getInstance();
+    par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT); // NOTE(review): same call as two lines above; presumably redundant, confirm
+    par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    for (size_t i = 0; i < par.createdb.size(); i++){
+        par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.extractorfs.size(); i++){
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.translatenucs.size(); i++){
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.result2profile.size(); i++){
+        par.result2profile[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+
+    par.sensitivity = 5.7; // these three values are assigned before parseParameters is called
+    par.removeTmpFiles = true;
+    par.alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
+    par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
+    par.PARAM_S.wasSet = true; // flagged as set regardless of the command line; presumably so they end up in SEARCH_PAR, confirm
+    par.PARAM_REMOVE_TMP_FILES.wasSet = true;
+    par.PARAM_ALIGNMENT_MODE.wasSet = true;
+
+    bool needBacktrace = false;
+    bool needTaxonomy = false;
+    bool needTaxonomyMapping = false;
+    { // the four flags declared inside this scope are filled by getOutputFormat but not used afterwards
+        bool needSequenceDB = false;
+        bool needFullHeaders = false;
+        bool needLookup = false;
+        bool needSource = false;
+        Parameters::getOutputFormat(par.outfmt, needSequenceDB, needBacktrace, needFullHeaders,
+                needLookup, needSource, needTaxonomyMapping, needTaxonomy);
+    }
+
+    if (par.formatAlignmentMode == Parameters::FORMAT_ALIGNMENT_SAM || par.greedyBestHits) { // these two settings force a backtrace as well
+        needBacktrace = true;
+    }
+    if (needBacktrace) {
+        Debug(Debug::INFO) << "Alignment backtraces will be computed, since they were requested by output format.\n";
+        par.addBacktrace = true;
+        par.PARAM_ADD_BACKTRACE.wasSet = true;
+    }
+
+    std::string tmpDir = par.filenames.back(); // the last element of par.filenames is used as the temporary directory
+    std::string hash = SSTR(par.hashParameter(par.filenames, *command.params));
+    if (par.reuseLatest) {
+        hash = FileUtil::getHashFromSymLink(tmpDir + "/latest");
+    }
+    tmpDir = FileUtil::createTemporaryDirectory(tmpDir, hash);
+    par.filenames.pop_back(); // drops the temporary directory entry
+
+    CommandCaller cmd;
+    cmd.addVariable("TMP_PATH", tmpDir.c_str());
+    cmd.addVariable("RESULTS", par.filenames.back().c_str()); // the new last element is the results name
+    par.filenames.pop_back();
+    std::string target = par.filenames.back().c_str(); // next from the back is the target
+    cmd.addVariable("TARGET", target.c_str());
+    par.filenames.pop_back();
+    if(needTaxonomy || needTaxonomyMapping){
+        Parameters::checkIfTaxDbIsComplete(target);
+    }
+
+    cmd.addVariable("QUERY", par.filenames.back().c_str()); // not popped: the remaining filenames are handed to execProgram below
+
+    cmd.addVariable("SEARCH_PAR", par.createParameterString(par.searchworkflow, true).c_str());
+    cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
+    cmd.addVariable("LEAVE_INPUT", par.dbOut ? "TRUE" : NULL);
+
+    cmd.addVariable("RUNNER", par.runner.c_str());
+    cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
+
+    cmd.addVariable("CREATEDB_QUERY_PAR", par.createParameterString(par.createdb).c_str());
+    par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD; // assigned after CREATEDB_QUERY_PAR was built, so only CREATEDB_PAR is built with this mode forced
+    cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.createdb).c_str());
+    cmd.addVariable("CONVERT_PAR", par.createParameterString(par.convertalignments).c_str());
+
+    std::string program = tmpDir + "/easyrbh.sh";
+    FileUtil::writeFile(program, easyrbh_sh, easyrbh_sh_len);
+    cmd.execProgram(program.c_str(), par.filenames);
+
+    // Should never get here
+    assert(false);
+    return EXIT_FAILURE;
+}
+
diff --git a/src/workflow/EasySearch.cpp b/src/workflow/EasySearch.cpp
index b36437f..f7aa4ad 100644
--- a/src/workflow/EasySearch.cpp
+++ b/src/workflow/EasySearch.cpp
@@ -27,26 +27,28 @@ void setEasySearchMustPassAlong(Parameters *p, bool linsearch) {
 
 int doeasysearch(int argc, const char **argv, const Command &command, bool linsearch) {
     Parameters &par = Parameters::getInstance();
-    par.overrideParameterDescription((Command &) command, par.PARAM_ADD_BACKTRACE.uniqid, NULL, NULL, par.PARAM_ADD_BACKTRACE.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_MAX_REJECTED.uniqid, NULL, NULL, par.PARAM_MAX_REJECTED.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_DB_OUTPUT.uniqid, NULL, NULL, par.PARAM_DB_OUTPUT.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_OVERLAP.uniqid, NULL, NULL, par.PARAM_OVERLAP.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_DB_OUTPUT.uniqid, NULL, NULL, par.PARAM_DB_OUTPUT.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_RESCORE_MODE.uniqid, NULL, NULL, par.PARAM_RESCORE_MODE.category | MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
     for (size_t i = 0; i < par.createdb.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.createdb[i]->uniqid, NULL, NULL, par.createdb[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     for (size_t i = 0; i < par.extractorfs.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.extractorfs[i]->uniqid, NULL, NULL, par.extractorfs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     for (size_t i = 0; i < par.translatenucs.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.translatenucs[i]->uniqid, NULL, NULL, par.translatenucs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     for (size_t i = 0; i < par.result2profile.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.result2profile[i]->uniqid, NULL, NULL, par.result2profile[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.result2profile[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
-    par.overrideParameterDescription((Command &) command, par.PARAM_THREADS.uniqid, NULL, NULL, par.PARAM_THREADS.category & ~MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_V.uniqid, NULL, NULL, par.PARAM_V.category & ~MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
 
     setEasySearchDefaults(&par, linsearch);
     par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
@@ -115,6 +117,7 @@ int doeasysearch(int argc, const char **argv, const Command &command, bool linse
     cmd.addVariable("LEAVE_INPUT", par.dbOut ? "TRUE" : NULL);
 
     cmd.addVariable("RUNNER", par.runner.c_str());
+    cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
 
     cmd.addVariable("CREATEDB_QUERY_PAR", par.createParameterString(par.createdb).c_str());
     par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD;
diff --git a/src/workflow/EasyTaxonomy.cpp b/src/workflow/EasyTaxonomy.cpp
index d4e924e..d21a83f 100644
--- a/src/workflow/EasyTaxonomy.cpp
+++ b/src/workflow/EasyTaxonomy.cpp
@@ -29,6 +29,37 @@ void setEasyTaxonomyMustPassAlong(Parameters *p) {
 
 int easytaxonomy(int argc, const char **argv, const Command& command) {
     Parameters& par = Parameters::getInstance();
+
+    par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_NUM_ITERATIONS.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_PICK_ID_FROM.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    for (size_t i = 0; i < par.createdb.size(); i++){
+        par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.extractorfs.size(); i++){
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.translatenucs.size(); i++){
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.result2profile.size(); i++){
+        par.result2profile[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.convertalignments.size(); i++){
+        par.convertalignments[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.createtsv.size(); i++){
+        par.createtsv[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+
     setEasyTaxonomyDefaults(&par);
     par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0);
     setEasyTaxonomyMustPassAlong(&par);
@@ -49,6 +80,7 @@ int easytaxonomy(int argc, const char **argv, const Command& command) {
     cmd.addVariable("TMP_PATH", tmpDir.c_str());
     cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
     cmd.addVariable("RUNNER", par.runner.c_str());
+    cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
 
     int alignmentMode = par.alignmentMode;
     if (par.taxonomySearchMode == Parameters::TAXONOMY_2BLCA) {
diff --git a/src/workflow/Linclust.cpp b/src/workflow/Linclust.cpp
index d9ec2c3..8eb72a0 100644
--- a/src/workflow/Linclust.cpp
+++ b/src/workflow/Linclust.cpp
@@ -10,7 +10,7 @@
 #include <cassert>
 
 void setLinclustWorkflowDefaults(Parameters *p) {
-    p->spacedKmer = true;
+    p->spacedKmer = false;
     p->covThr = 0.8;
     p->maskMode = 0;
     p->evalThr = 0.001;
@@ -21,9 +21,14 @@ void setLinclustWorkflowDefaults(Parameters *p) {
 int linclust(int argc, const char **argv, const Command& command) {
     Parameters& par = Parameters::getInstance();
     setLinclustWorkflowDefaults(&par);
-    par.overrideParameterDescription((Command &)command, par.PARAM_RESCORE_MODE.uniqid, NULL, NULL, par.PARAM_RESCORE_MODE.category |MMseqsParameter::COMMAND_EXPERT );
-    par.overrideParameterDescription((Command &)command, par.PARAM_MAX_REJECTED.uniqid, NULL, NULL, par.PARAM_MAX_REJECTED.category |MMseqsParameter::COMMAND_EXPERT );
-    par.overrideParameterDescription((Command &)command, par.PARAM_MAX_ACCEPT.uniqid, NULL, NULL, par.PARAM_MAX_ACCEPT.category |MMseqsParameter::COMMAND_EXPERT );
+    par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ALT_ALIGNMENT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_ZDROP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_ACCEPT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.overrideParameterDescription(par.PARAM_S, "Sensitivity will be automatically determined but can be adjusted", NULL, par.PARAM_S.category | MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_INCLUDE_ONLY_EXTENDABLE.addCategory(MMseqsParameter::COMMAND_EXPERT);
 
     par.parseParameters(argc, argv, command, true, 0, 0);
 
@@ -41,7 +46,7 @@ int linclust(int argc, const char **argv, const Command& command) {
     cmd.addVariable("RUNNER", par.runner.c_str());
 
     // save some values to restore them later
-    size_t alphabetSize = par.alphabetSize;
+    MultiParam<int>alphabetSize = par.alphabetSize;
     size_t kmerSize = par.kmerSize;
     // # 1. Finding exact $k$-mer matches.
     bool kmerSizeWasSet = false;
@@ -74,7 +79,7 @@ int linclust(int argc, const char **argv, const Command& command) {
         par.kmerSize = Parameters::CLUST_LINEAR_DEFAULT_K;
     }
     if (alphabetSizeWasSet == false) {
-        par.alphabetSize = Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE;
+        par.alphabetSize = MultiParam<int>(Parameters::CLUST_LINEAR_DEFAULT_ALPH_SIZE, 5);
     }
 
     const int dbType = FileUtil::parseDbType(par.db1.c_str());
@@ -90,6 +95,8 @@ int linclust(int argc, const char **argv, const Command& command) {
     cmd.addVariable("FILTER", Parameters::isEqualDbtype(dbType, Parameters::DBTYPE_AMINO_ACIDS) ? "1" : NULL);
     cmd.addVariable("KMERMATCHER_PAR", par.createParameterString(par.kmermatcher).c_str());
     cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
+    cmd.addVariable("VERBOSITYANDCOMPRESS", par.createParameterString(par.threadsandcompression).c_str());
+
     par.alphabetSize = alphabetSize;
     par.kmerSize = kmerSize;
 
diff --git a/src/workflow/Linsearch.cpp b/src/workflow/Linsearch.cpp
index d42302b..5dcca65 100644
--- a/src/workflow/Linsearch.cpp
+++ b/src/workflow/Linsearch.cpp
@@ -16,7 +16,7 @@ namespace Linsearch {
 #include <cassert>
 
 void setLinsearchDefaults(Parameters *p) {
-    p->spacedKmer = true;
+    p->spacedKmer = false;
     p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV;
     p->sensitivity = 5.7;
     p->evalThr = 0.001;
@@ -27,38 +27,31 @@ void setLinsearchDefaults(Parameters *p) {
     p->evalProfile = 0.1;
 
     // VTML has a slightly lower sensitivity in the regression test
-    p->seedScoringMatrixFile = ScoreMatrixFile("blosum62.out", "nucleotide.out");
+    p->seedScoringMatrixFile = MultiParam<char*>("blosum62.out", "nucleotide.out");
 }
 
 
 int linsearch(int argc, const char **argv, const Command &command) {
     Parameters &par = Parameters::getInstance();
     setLinsearchDefaults(&par);
-    par.overrideParameterDescription((Command &) command, par.PARAM_COV_MODE.uniqid, NULL, NULL,
-                                     par.PARAM_COV_MODE.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_C.uniqid, NULL, NULL,
-                                     par.PARAM_C.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_MIN_SEQ_ID.uniqid, NULL, NULL,
-                                     par.PARAM_MIN_SEQ_ID.category | MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_COV_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_C.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MIN_SEQ_ID.addCategory(MMseqsParameter::COMMAND_EXPERT);
     for (size_t i = 0; i < par.extractorfs.size(); i++) {
-        par.overrideParameterDescription((Command &) command, par.extractorfs[i]->uniqid, NULL, NULL,
-                                         par.extractorfs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     for (size_t i = 0; i < par.translatenucs.size(); i++) {
-        par.overrideParameterDescription((Command &) command, par.translatenucs[i]->uniqid, NULL, NULL,
-                                         par.translatenucs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
-    par.overrideParameterDescription((Command &) command, par.PARAM_THREADS.uniqid, NULL, NULL,
-                                     par.PARAM_THREADS.category & ~MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_V.uniqid, NULL, NULL,
-                                     par.PARAM_V.category & ~MMseqsParameter::COMMAND_EXPERT);
-    par.parseParameters(argc, argv, command, true, 0,
-                        MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PREFILTER);
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
 
+    par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PREFILTER);
 
     const int queryDbType = FileUtil::parseDbType(par.db1.c_str());
     std::string indexStr = LinsearchIndexReader::searchForIndex(par.db2);
-    if (indexStr.size() == 0) {
+    if (indexStr.empty()) {
         Debug(Debug::ERROR) << par.db2 << " needs to be index.\n";
         Debug(Debug::ERROR) << "createlinindex " << par.db2 << ".\n";
         EXIT(EXIT_FAILURE);
diff --git a/src/workflow/Map.cpp b/src/workflow/Map.cpp
index f989774..23bdea5 100644
--- a/src/workflow/Map.cpp
+++ b/src/workflow/Map.cpp
@@ -27,21 +27,17 @@ int map(int argc, const char **argv, const Command &command) {
     Parameters &par = Parameters::getInstance();
     setMapWorkflowDefaults(&par);
 
-    par.overrideParameterDescription((Command &) command, par.PARAM_OVERLAP.uniqid, NULL, NULL,
-                                     par.PARAM_OVERLAP.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_DB_OUTPUT.uniqid, NULL, NULL,
-                                     par.PARAM_DB_OUTPUT.category | MMseqsParameter::COMMAND_EXPERT);
-
+    par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
     for (size_t i = 0; i < par.extractorfs.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.extractorfs[i]->uniqid, NULL, NULL, par.extractorfs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     for (size_t i = 0; i < par.translatenucs.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.translatenucs[i]->uniqid, NULL, NULL, par.translatenucs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
-    par.overrideParameterDescription((Command &) command, par.PARAM_V.uniqid, NULL, NULL,
-                                     par.PARAM_V.category & ~MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_THREADS.uniqid, NULL, NULL,
-                                     par.PARAM_THREADS.category & ~MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
 
     par.parseParameters(argc, argv, command, true, 0, 0);
 
diff --git a/src/workflow/Rbh.cpp b/src/workflow/Rbh.cpp
index 277cffc..5be1306 100644
--- a/src/workflow/Rbh.cpp
+++ b/src/workflow/Rbh.cpp
@@ -9,6 +9,7 @@
 
 void setRbhDefaults(Parameters *p) {
     p->compBiasCorrection = 0;
+    p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID;
     p->maskMode = 0;
     p->orfStartMode = 1;
     p->orfMinLength = 10;
@@ -20,22 +21,19 @@ int rbh(int argc, const char **argv, const Command &command) {
     setRbhDefaults(&par);
 
     // set a lot of possibly misleading comments to EXPERT mode
-    par.overrideParameterDescription((Command &) command, par.PARAM_OVERLAP.uniqid, NULL, NULL,
-                                     par.PARAM_OVERLAP.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_DB_OUTPUT.uniqid, NULL, NULL,
-                                     par.PARAM_DB_OUTPUT.category | MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
 
     for (size_t i = 0; i < par.extractorfs.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.extractorfs[i]->uniqid, NULL, NULL, par.extractorfs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     for (size_t i = 0; i < par.translatenucs.size(); i++){
-        par.overrideParameterDescription((Command &)command, par.translatenucs[i]->uniqid, NULL, NULL, par.translatenucs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     // restore threads and verbosity
-    par.overrideParameterDescription((Command &) command, par.PARAM_V.uniqid, NULL, NULL,
-                                     par.PARAM_V.category & ~MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_THREADS.uniqid, NULL, NULL,
-                                     par.PARAM_THREADS.category & ~MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
 
     par.parseParameters(argc, argv, command, true, 0, 0);
 
@@ -58,7 +56,7 @@ int rbh(int argc, const char **argv, const Command &command) {
     cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
     cmd.addVariable("VERB_COMP_PAR", par.createParameterString(par.verbandcompression).c_str());
     cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str());
-
+    cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());
     std::string program = tmpDir + "/rbh.sh";
     FileUtil::writeFile(program, rbh_sh, rbh_sh_len);
     cmd.execProgram(program.c_str(), par.filenames);
diff --git a/src/workflow/Search.cpp b/src/workflow/Search.cpp
index 72a8dd1..6b83ee6 100644
--- a/src/workflow/Search.cpp
+++ b/src/workflow/Search.cpp
@@ -45,18 +45,18 @@ int computeSearchMode(int queryDbType, int targetDbType, int targetSrcDbType, in
             if(searchType == Parameters::SEARCH_TYPE_AUTO){
                 // WARNING because its not really an error, just a req. parameter
                 Debug(Debug::WARNING) << "It is unclear from the input if a translated or nucleotide search should be performed\n"
-                                         "Please provide the parameter --search-type 2 (translated) or 3 (nucleotide)\n";
+                                         "Please provide the parameter --search-type 2 (translated), 3 (nucleotide) or 4 (translated nucleotide backtrace)\n";
                 EXIT(EXIT_FAILURE);
             }
             // nucl/nucl
             // nucl/nucl translated
-            if(searchType == Parameters::SEARCH_TYPE_TRANSLATED){
+            if(searchType == Parameters::SEARCH_TYPE_TRANSLATED||searchType == Parameters::SEARCH_TYPE_TRANS_NUCL_ALN){
                 return Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED| Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED;
             }else if (searchType == Parameters::SEARCH_TYPE_NUCLEOTIDES ){
                 return Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE| Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE;
             } else {
                 Debug(Debug::ERROR) << "--search-type 1 (amino acid) can not used in combination with a nucleotide database\n "
-                                       "The only possible options --search-types 2 (translated) or 3 (nucleotide)\n";
+                                       "The only possible options --search-types 2 (translated), 3 (nucleotide) or 4 (translated nucleotide backtrace)\n";
                 EXIT(EXIT_FAILURE);
             }
         }
@@ -196,38 +196,26 @@ void setNuclSearchDefaults(Parameters *p) {
     if (  p->PARAM_MAX_SEQ_LEN.wasSet == false) {
         p->maxSeqLen = 10000;
     }
-    if( p->PARAM_GAP_OPEN.wasSet == false){
-        p->gapOpen = 5;
-    }
-    if( p->PARAM_GAP_EXTEND.wasSet  == false){
-        p->gapExtend = 2;
-    }
 }
 
 
 int search(int argc, const char **argv, const Command& command) {
     Parameters &par = Parameters::getInstance();
     setSearchDefaults(&par);
-    par.overrideParameterDescription((Command &) command, par.PARAM_COV_MODE.uniqid, NULL, NULL,
-                                     par.PARAM_COV_MODE.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_C.uniqid, NULL, NULL,
-                                     par.PARAM_C.category | MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_MIN_SEQ_ID.uniqid, NULL, NULL,
-                                     par.PARAM_MIN_SEQ_ID.category | MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_COV_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_C.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MIN_SEQ_ID.addCategory(MMseqsParameter::COMMAND_EXPERT);
     for (size_t i = 0; i < par.extractorfs.size(); i++) {
-        par.overrideParameterDescription((Command &) command, par.extractorfs[i]->uniqid, NULL, NULL,
-                                         par.extractorfs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     for (size_t i = 0; i < par.translatenucs.size(); i++) {
-        par.overrideParameterDescription((Command &) command, par.translatenucs[i]->uniqid, NULL, NULL,
-                                         par.translatenucs[i]->category | MMseqsParameter::COMMAND_EXPERT);
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
-    par.overrideParameterDescription((Command &) command, par.PARAM_THREADS.uniqid, NULL, NULL,
-                                     par.PARAM_THREADS.category & ~MMseqsParameter::COMMAND_EXPERT);
-    par.overrideParameterDescription((Command &) command, par.PARAM_V.uniqid, NULL, NULL,
-                                     par.PARAM_V.category & ~MMseqsParameter::COMMAND_EXPERT);
-    par.parseParameters(argc, argv, command, true, 0,
-                        MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PREFILTER);
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+
+    par.parseParameters(argc, argv, command, false, 0, MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PREFILTER);
 
     std::string indexStr = PrefilteringIndexReader::searchForIndex(par.db2);
 
@@ -251,11 +239,10 @@ int search(int argc, const char **argv, const Command& command) {
 
     int searchMode = computeSearchMode(queryDbType, targetDbType, targetSrcDbType, par.searchType);
 
-    if((searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE)  && (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE)) {
+    if ((searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_NUCLEOTIDE) && (searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_NUCLEOTIDE)) {
         setNuclSearchDefaults(&par);
     } else{
-        par.overrideParameterDescription((Command &) command, par.PARAM_STRAND.uniqid, NULL, NULL,
-                                         par.PARAM_STRAND.category | MMseqsParameter::COMMAND_EXPERT);
+        par.PARAM_STRAND.addCategory(MMseqsParameter::COMMAND_EXPERT);
     }
     // FIXME: use larger default k-mer size in target-profile case if memory is available
     // overwrite default kmerSize for target-profile searches and parse parameters again
@@ -315,11 +302,6 @@ int search(int argc, const char **argv, const Command& command) {
 //    cmd.addVariable("ALIGNMENT_DB_EXT", Parameters::isEqualDbtype(targetDbType, Parameters::DBTYPE_PROFILE_STATE_SEQ) ? ".255" : "");
     par.filenames[1] = targetDB;
     if (par.sliceSearch == true) {
-        if ((searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_PROFILE) == false) {
-            par.printUsageMessage(command, MMseqsParameter::COMMAND_ALIGN|MMseqsParameter::COMMAND_PREFILTER);
-            Debug(Debug::ERROR) << "Sliced search only works with profiles as targets.\n";
-            EXIT(EXIT_FAILURE);
-        }
 
         // By default (0), diskSpaceLimit (in bytes) will be set in the workflow to use as much as possible
         cmd.addVariable("AVAIL_DISK", SSTR(static_cast<size_t>(par.diskSpaceLimit)).c_str());
@@ -342,7 +324,6 @@ int search(int argc, const char **argv, const Command& command) {
         cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.align).c_str());
         cmd.addVariable("SORTRESULT_PAR", par.createParameterString(par.sortresult).c_str());
         par.covMode = originalCovMode;
-        cmd.addVariable("VERBOSITY_PAR", par.createParameterString(par.onlyverbosity).c_str());
 
         program = tmpDir + "/searchslicedtargetprofile.sh";
         FileUtil::writeFile(program, searchslicedtargetprofile_sh, searchslicedtargetprofile_sh_len);
diff --git a/src/workflow/Taxonomy.cpp b/src/workflow/Taxonomy.cpp
index 9ead308..bc32b92 100644
--- a/src/workflow/Taxonomy.cpp
+++ b/src/workflow/Taxonomy.cpp
@@ -26,6 +26,31 @@ void setTaxonomyMustPassAlong(Parameters *p) {
 
 int taxonomy(int argc, const char **argv, const Command& command) {
     Parameters& par = Parameters::getInstance();
+
+    par.PARAM_ADD_BACKTRACE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_MAX_REJECTED.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_OVERLAP.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_DB_OUTPUT.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_RESCORE_MODE.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_NUM_ITERATIONS.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_PICK_ID_FROM.addCategory(MMseqsParameter::COMMAND_EXPERT);
+    for (size_t i = 0; i < par.createdb.size(); i++){
+        par.createdb[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.extractorfs.size(); i++){
+        par.extractorfs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.translatenucs.size(); i++){
+        par.translatenucs[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    for (size_t i = 0; i < par.result2profile.size(); i++){
+        par.result2profile[i]->addCategory(MMseqsParameter::COMMAND_EXPERT);
+    }
+    par.PARAM_COMPRESSED.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_THREADS.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+    par.PARAM_V.removeCategory(MMseqsParameter::COMMAND_EXPERT);
+
     setTaxonomyDefaults(&par);
     par.parseParameters(argc, argv, command, true, 0, 0);
     setTaxonomyMustPassAlong(&par);
@@ -44,7 +69,7 @@ int taxonomy(int argc, const char **argv, const Command& command) {
     cmd.addVariable("RUNNER", par.runner.c_str());
 
     int alignmentMode = par.alignmentMode;
-    if (par.taxonomySearchMode == Parameters::TAXONOMY_2BLCA) {
+    if (par.taxonomySearchMode == Parameters::TAXONOMY_2BLCA || par.taxonomySearchMode == Parameters::TAXONOMY_2BLCA_APPROX) {
         // at least cov must be set for extractalignedregion
         int targetMode = (int)Parameters::ALIGNMENT_MODE_SCORE_COV;
         par.alignmentMode = std::max(par.alignmentMode, targetMode);
diff --git a/util/regression b/util/regression
index fd14d65..2d98a56 160000
--- a/util/regression
+++ b/util/regression
@@ -1 +1 @@
-Subproject commit fd14d653c296d5f99007653bc139a4486184d7ba
+Subproject commit 2d98a5602167e33dca0ec79a7ad03b62e2579464