From e594d87f3a7d30c2044977762329ab45b8b3ae9c Mon Sep 17 00:00:00 2001
From: Ruoshi Zhang
Date: Thu, 28 Nov 2019 17:21:03 +0100
Subject: [PATCH] Squashed 'lib/mmseqs/' changes from 3097aa5a3..0a1348be7
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

0a1348be7 offsetalignments now correctly returns a nucleotide backtrace if needed
34da34640 include VTML40 in binary for easier access
d82fdd233 Add missed target .source file for reading in convertalis
b87ac140d Overload patterncompiler isMatch for pos of match
013ee7cf8 avoid appending extra tabs besthitperset
77567ee86 Update regression to fix samtools validation never getting called on linux
e10238de0 Make citations extendable in inheriting tools
3a53b0a7e Reset progress if createdb has to redo
cee42cf19 Remove useless buffer
27995609f Warning for compressed databases in soft split mode
4224ae469 fix warning in convertalis
9af458912 Add tax. support to convertalis
102e4299b fixed bug in offsetalignment
f36513b54 corrected header len
913e40f2f Next try for strict ordering problem
aa9251fd4 Update README
c570e476c Fix not existing file deleted in createtaxdb.sh
173a54e16 Fix wrong weak ordering for std::sort in DBReader::sortIndex
3ac5fc142 remove travis macos build
11e273602 Add a mode to createdb to support soft linking data instead of copy
5ae5503a9 changed skipping logic for sequences with repeated kmers: skip repeated kmer when looking for matches instead of skipping whole sequence, replaced skipNRepeatKmer parameter by ignore-multi-kmer parameter with new logic
84e53091f Fix issue https://github.com/soedinglab/MMseqs2/issues/239
c0a8ed056 Merge branch 'master' of https://github.com/soedinglab/MMseqs2
99d763e13 fixed really rare edgecase of wrong kmermatch positions
88cb82700 Try fixing macOS build when coreutils is not installed
49a7568e6 compile macOS build with AppleClang
34eefdec0 Silence omptl warnings
c94d00e50 Third time is hopefully the charm in deleting regression tmp results
b8f968b4b Now the intermediate regression tests should actually be deleted
489708b68 New regression cleans up after itself to not run out of space
d6bf14f9b Allow libomp compilation on macOS
9c224bbe4 Add threshold for k-mer only prefilter
064334269 Fix memory leak in db-load-mode 1
4ddf8bbb1 Merge branch 'master' of https://github.com/soedinglab/MMseqs2
3e3ec7771 fixed edge case in alignment calculation for length=1
d8d607b0c Include db-load-mode regression test
ebb16f363 Fix #234: Prefilter with precomputed index would load invalid memory in PRELOAD_MODE_FREAD
c88dd3818 fixed comparison between signed and unsigned integer error
b7ffc1066 added wrapped scoring for ungapped and gapped nucleotide alignments in align module, shortened result if wrapped scoring is activated
0c4b74980 added wrapped scoring for ungapped alignments
1dd955a2c Remove mmap merge branch in mergeTargetSplits
8c79865b6 Breaking fix #179: —-lca-ranks now expects ranks semicolon separated and will return species names also semicolon separated
790d70566 Mark krona as vendored
274752cb0 fixed comparison bug in kmermatcher
2f4b77c18 Merge branch 'master' of https://github.com/soedinglab/MMseqs2
99462142c introduced template for seqLen and seqPos in KmerPosition to handle kmermatches of sequences larger than MAX_SHRT corrrectly
eb090882b Invalid parameter passed to taxonomyreport in easy-taxonomy workflow
076daa363 escape species name in taxonomy report so it cant break krona
66385552c Fix broken taxonomyreport CLI again
deb0e82a7 Taxonomy report can also build a Krona HTML report
eba0ccf03 Fix 230: apply mode was using the wrong database entry length
1b9a22574 Fix #229: Invalid query length used in result2msa
f05f8c51d reduce efficiency impact of edgecase fix in gapped alignment computation
5d78b6c41 updated regressiontest for easynuclnucltax
c2e7c273a fixed edgecase for gapped alignment computation
4404fe0a7 Fix pairwise LCA function
5c69b0dee Fix issues with splitsequence if combined with compressed
fff584e77 Try getc based merging instead of mmap
736e0bfe8 Fix error messages
926920014 Shorten merging debug output
461301834 Allow target split merging to happen multi-threaded again
d1b5f63fa Cleanup more dead code
205740420 Some cleanup in Util
202efad1b Fix warning
62aae0335 Fix compiler issues
0f963300b Quick fix prefix id issue with lookup
0a88a9ee2 Merge branch 'master' of https://github.com/soedinglab/mmseqs2
488b14f4f Open reader

git-subtree-dir: lib/mmseqs
git-subtree-split: 0a1348be78bd84137bdb373ba32e0e8c054b3e1c
---
 .gitattributes | 1 +
 .travis.yml | 15 -
 README.md | 11 +-
 azure-pipelines.yml | 4 +-
 data/CMakeLists.txt | 2 +
 data/createtaxdb.sh | 2 +-
 data/easysearch.sh | 2 +-
 data/easytaxonomy.sh | 8 +-
 data/krona_prelude.html | 6619 ++++++++++++++++++
 lib/ksw2/kseq.h | 26 +-
 lib/omptl/omptl_algorithm | 8 +-
 src/CMakeLists.txt | 13 +-
 src/MMseqsBase.cpp | 6 +-
 src/alignment/Alignment.cpp | 74 +-
 src/alignment/Alignment.h | 6 +-
 src/alignment/BandedNucleotideAligner.cpp | 73 +-
 src/alignment/BandedNucleotideAligner.h | 2 +-
 src/alignment/DistanceCalculator.h | 52 +-
 src/alignment/Main.cpp | 4 +-
 src/alignment/Matcher.cpp | 14 +-
 src/alignment/Matcher.h | 2 +-
 src/alignment/MultipleAlignment.h | 4 -
 src/alignment/StripedSmithWaterman.h | 5 +-
 src/alignment/rescorediagonal.cpp | 61 +-
 src/clustering/CMakeLists.txt | 1 -
 src/clustering/Clustering.h | 1 -
 src/clustering/ClusteringAlgorithms.h | 1 -
 src/clustering/SetElement.h | 27 -
 src/commons/Command.h | 18 +-
 src/commons/DBReader.cpp | 10 +-
 src/commons/DBReader.h | 69 +-
 src/commons/DBWriter.cpp | 4 +-
 src/commons/DBWriter.h | 3 +
 src/commons/FileUtil.cpp | 13 +
 src/commons/FileUtil.h | 2 +
 src/commons/KSeqWrapper.cpp | 7 +-
 src/commons/KSeqWrapper.h | 2 +
 src/commons/Parameters.cpp | 127 +-
 src/commons/Parameters.h | 38 +-
 src/commons/PatternCompiler.h | 5 +
 src/commons/Sequence.cpp | 1 -
 src/commons/SubstitutionMatrix.cpp | 4 +
 src/commons/Util.cpp | 55 +-
 src/commons/Util.h | 17 +-
 src/linclust/LinsearchIndexReader.cpp | 24 +-
 src/linclust/LinsearchIndexReader.h | 6 +-
 src/linclust/kmerindexdb.cpp | 4 +-
 src/linclust/kmermatcher.cpp | 276 +-
 src/linclust/kmermatcher.h | 40 +-
 src/linclust/kmersearch.cpp | 46 +-
 src/linclust/kmersearch.h | 8 +-
 src/multihit/MultiHitDb.cpp | 1 -
 src/multihit/MultiHitSearch.cpp | 1 -
 src/multihit/besthitperset.cpp | 14 +-
 src/prefiltering/IndexTable.h | 7 +-
 src/prefiltering/Prefiltering.cpp | 195 +-
 src/prefiltering/Prefiltering.h | 5 +-
 src/prefiltering/PrefilteringIndexReader.cpp | 4 +-
 src/prefiltering/QueryMatcher.cpp | 1 +
 src/prefiltering/SequenceLookup.cpp | 9 +-
 src/prefiltering/SequenceLookup.h | 2 +-
 src/taxonomy/NcbiTaxonomy.cpp | 3 +-
 src/taxonomy/lca.cpp | 4 +-
 src/taxonomy/taxonomyreport.cpp | 81 +-
 src/test/TestDBReader.cpp | 5 +-
 src/test/TestDBReaderZstd.cpp | 2 -
 src/test/TestDiagonalScoring.cpp | 3 -
 src/test/TestDiagonalScoringPerformance.cpp | 3 -
 src/test/TestSequenceIndex.cpp | 1 -
 src/util/apply.cpp | 16 +-
 src/util/convert2fasta.cpp | 17 +-
 src/util/convertalignments.cpp | 93 +-
 src/util/convertmsa.cpp | 1 +
src/util/createdb.cpp | 194 +- src/util/createseqfiledb.cpp | 28 +- src/util/diffseqdbs.cpp | 6 +- src/util/extractdomains.cpp | 2 +- src/util/indexdb.cpp | 2 + src/util/offsetalignment.cpp | 25 +- src/util/prefixid.cpp | 18 +- src/util/result2flat.cpp | 4 +- src/util/result2msa.cpp | 9 +- src/util/result2stats.cpp | 25 +- src/util/splitsequence.cpp | 5 + src/util/subtractdbs.cpp | 2 + src/util/summarizetabs.cpp | 1 + src/workflow/EasyCluster.cpp | 1 + src/workflow/EasyLinclust.cpp | 1 + src/workflow/EasySearch.cpp | 15 +- src/workflow/EasyTaxonomy.cpp | 6 +- src/workflow/Rbh.cpp | 2 +- util/build_osx.sh | 28 +- util/regression | 2 +- 93 files changed, 7916 insertions(+), 751 deletions(-) create mode 100644 data/krona_prelude.html delete mode 100644 src/clustering/SetElement.h diff --git a/.gitattributes b/.gitattributes index a8421d5..76d0629 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,3 @@ +data/krona_prelude.html linguist-vendored lib/* linguist-vendored lib/simd linguist-vendored=false diff --git a/.travis.yml b/.travis.yml index c6796be..1bd82a3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -91,18 +91,6 @@ matrix: - libopenmpi-dev - shellcheck env: MPI=1 CC=gcc-8 CXX=g++-8 - - os: osx - osx_image: xcode10.1 - addons: - homebrew: - packages: - - cmake - - ninja - - gcc@8 - - zlib - - bzip2 - - shellcheck - env: CC=gcc-8 CXX=g++-8 allow_failures: - env: QEMU_ARM=1 fast_finish: true @@ -111,7 +99,6 @@ services: - docker before_install: - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew link gcc@8 --force; fi - export CC - export CXX @@ -124,8 +111,6 @@ script: mkdir build; cd build; \ cmake -G Ninja -DENABLE_WERROR=1 -DHAVE_MPI="$MPI" -DHAVE_SSE4_1=1 -DHAVE_TESTS=1 -DREQUIRE_OPENMP=0 .. \ || exit 1; ninja || exit 1; \ - elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then \ - ./util/build_osx.sh . build || exit 1; \ else \ exit 1; \ fi diff --git a/README.md b/README.md index 2c9c5f6..264af27 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ MMseqs2 (Many-against-Many sequence searching) is a software suite to search and [Steinegger M and Soeding J. Clustering huge protein sequence sets in linear time. Nature Communications, doi: 10.1038/s41467-018-04964-5 (2018)](https://www.nature.com/articles/s41467-018-04964-5). -[Mirdita M, Steinegger M and Soeding J. MMseqs2 desktop and local web server app for fast, interactive sequence searches. Bioinformatics, doi: 10.1093/bioinformatics/bty1057 (2019)](https://academic.oup.com/bioinformatics/article/35/16/2856/5280135) +[Mirdita M, Steinegger M and Soeding J. MMseqs2 desktop and local web server app for fast, interactive sequence searches. Bioinformatics, doi: 10.1093/bioinformatics/bty1057 (2019)](https://academic.oup.com/bioinformatics/article/35/16/2856/5280135). [![BioConda Install](https://img.shields.io/conda/dn/bioconda/mmseqs2.svg?style=flag&label=BioConda%20install)](https://anaconda.org/bioconda/mmseqs2) [![Github All Releases](https://img.shields.io/github/downloads/soedinglab/mmseqs2/total.svg)](https://github.com/soedinglab/mmseqs2/releases/latest) @@ -20,7 +20,7 @@ MMseqs2 (Many-against-Many sequence searching) is a software suite to search and ## Documentation -The MMseqs2 user guide is available in our [GitHub Wiki](https://github.com/soedinglab/mmseqs2/wiki) or as a [PDF file](https://mmseqs.com/latest/userguide.pdf) (Thanks to [pandoc](https://github.com/jgm/pandoc)!). We provide a tutorial of MMseqs2 [here](https://github.com/soedinglab/metaG-ECCB18-partII). 
+The MMseqs2 user guide is available in our [GitHub Wiki](https://github.com/soedinglab/mmseqs2/wiki) or as a [PDF file](https://mmseqs.com/latest/userguide.pdf) (Thanks to [pandoc](https://github.com/jgm/pandoc)!). The wiki also contains [tutorials](https://github.com/soedinglab/MMseqs2/wiki/Tutorials) to learn how to use MMseqs2 with real data. Keep posted about MMseqs2/Linclust updates by following Martin on [Twitter](https://twitter.com/thesteinegger). @@ -61,10 +61,7 @@ Compiling MMseqs2 from source has the advantage that it will be optimized to the make install export PATH=$(pwd)/bin/:$PATH -:exclamation: To compile MMseqs2 on MacOS, first install the `gcc` compiler from Homebrew. The default MacOS `clang` compiler does not support OpenMP and MMseqs2 will only be able to use a single thread. Then use the following `cmake` call: - - CC="$(brew --prefix)/bin/gcc-9" CXX="$(brew --prefix)/bin/g++-9" cmake -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_PREFIX=. .. - +:exclamation: Compiling MMseqs2 correctly on macOS requires [more effort](https://github.com/soedinglab/MMseqs2/wiki#compile-from-source-under-macos). ## Getting started We provide `easy` workflows to cluster, search and assign taxonomy. These `easy` workflows are a shorthand to deal directly with FASTA/FASTQ files as input and output. MMseqs2 provides many modules to transform, filter, execute external programs and search. However, these modules use the MMseqs2 database formats, instead of the FASTA/FASTQ format. For maximum flexibility, we recommend using MMseqs2 workflows and modules directly. Please read more about this in the [documentation](https://github.com/soedinglab/mmseqs2/wiki). @@ -81,7 +78,7 @@ For clustering, MMseqs2 `easy-cluster` and `easy-linclust` are available. mmseqs easy-linclust examples/DB.fasta clusterRes tmp -Sequence identity is in default [estimated](https://github.com/soedinglab/MMseqs2/wiki#how-does-mmseqs2-compute-the-sequence-identity) to output real sequence identity use `--alignment-mode 3`. +Sequence identity is by default [estimated](https://github.com/soedinglab/MMseqs2/wiki#how-does-mmseqs2-compute-the-sequence-identity) to output real sequence identity use `--alignment-mode 3`. Read more about the [clustering format](https://github.com/soedinglab/mmseqs2/wiki#clustering-format) in our user guide. Please adjust the [clustering criteria](https://github.com/soedinglab/MMseqs2/wiki#clustering-criteria) and check if temporary directory provides enough free space. For disk space requirements, see the user guide. diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f91790c..9df46ea 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -35,11 +35,11 @@ jobs: - checkout: self submodules: true - script: | - brew install cmake gcc@9 zlib bzip2 coreutils + brew install cmake zlib bzip2 libomp displayName: Install Dependencies - script: | cd ${BUILD_SOURCESDIRECTORY} - CC=gcc-9 CXX=g++-9 ./util/build_osx.sh . build + ./util/build_osx.sh . 
build displayName: Build MMseqs2 - script: | ${BUILD_SOURCESDIRECTORY}/util/regression/run_regression.sh ${BUILD_SOURCESDIRECTORY}/build/build_sse41/src/mmseqs ${BUILD_SOURCESDIRECTORY}/regression diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index 26ad17f..52cdfc9 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -22,6 +22,7 @@ set(COMPILED_RESOURCES enrich.sh blastn.sh VTML80.out + VTML40.out nucleotide.out blosum62.out PAM30.out @@ -35,6 +36,7 @@ set(COMPILED_RESOURCES searchslicedtargetprofile.sh cs219.lib linsearch.sh + krona_prelude.html ) set(GENERATED_OUTPUT_HEADERS "") diff --git a/data/createtaxdb.sh b/data/createtaxdb.sh index ec998ac..595e48a 100755 --- a/data/createtaxdb.sh +++ b/data/createtaxdb.sh @@ -59,7 +59,7 @@ if [ -n "$REMOVE_TMP" ]; then rm -f "${TMP_PATH}/names.dmp" "${TMP_PATH}/nodes.dmp" "${TMP_PATH}/merged.dmp" "${TMP_PATH}/delnodes.dmp" rm -f "${TMP_PATH}/taxidmapping" if [ "$DOWNLOAD_DATA" -eq "1" ]; then - rm -f "${TMP_PATH}/ncbi_download.complete" "${TMP_PATH}/targetDB_mapping.complete" + rm -f "${TMP_PATH}/ncbi_download.complete" "${TMP_PATH}/mapping_download.complete" fi rm -f "${TMP_PATH}/targetDB_mapping.complete" rm -f "${TMP_PATH}/targetDB_mapping" diff --git a/data/easysearch.sh b/data/easysearch.sh index 3d3f002..bd922d7 100755 --- a/data/easysearch.sh +++ b/data/easysearch.sh @@ -14,7 +14,7 @@ INPUT="$INPUT" if notExists "${TMP_PATH}/query.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" createdb "$@" "${TMP_PATH}/query" ${CREATEDB_PAR} \ + "$MMSEQS" createdb "$@" "${TMP_PATH}/query" ${CREATEDB_QUERY_PAR} \ || fail "query createdb died" fi diff --git a/data/easytaxonomy.sh b/data/easytaxonomy.sh index aa4cd28..d7bd258 100755 --- a/data/easytaxonomy.sh +++ b/data/easytaxonomy.sh @@ -10,7 +10,7 @@ notExists() { if notExists "${TMP_PATH}/query.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" createdb "$@" "${TMP_PATH}/query" ${CREATEDB_PAR} \ + "$MMSEQS" createdb "$@" "${TMP_PATH}/query" ${CREATEDB_QUERY_PAR} \ || fail "query createdb died" fi @@ -40,7 +40,7 @@ fi if notExists "${TMP_PATH}/result_tophit1.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" filterdb "${TMP_PATH}/result" "${TMP_PATH}/result_top1" --extract-lines 1 ${THREADS_PAR} \ + "$MMSEQS" filterdb "${TMP_PATH}/result" "${TMP_PATH}/result_top1" --extract-lines 1 ${THREADS_COMP_PAR} \ || fail "filterdb died" fi @@ -52,13 +52,13 @@ fi if notExists "${TMP_PATH}/result_top1_swapped_sum.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" summarizealis "${TMP_PATH}/result_top1_swapped" "${TMP_PATH}/result_top1_swapped_sum" ${THREADS_PAR} \ + "$MMSEQS" summarizealis "${TMP_PATH}/result_top1_swapped" "${TMP_PATH}/result_top1_swapped_sum" ${THREADS_COMP_PAR} \ || fail "filterdb died" fi if notExists "${TMP_PATH}/result_top1_swapped_sum_tax.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" addtaxonomy "${TARGET}" "${TMP_PATH}/result_top1_swapped_sum" "${TMP_PATH}/result_top1_swapped_sum_tax" ${THREADS_PAR} --pick-id-from 1 --tax-lineage \ + "$MMSEQS" addtaxonomy "${TARGET}" "${TMP_PATH}/result_top1_swapped_sum" "${TMP_PATH}/result_top1_swapped_sum_tax" ${THREADS_COMP_PAR} --pick-id-from 1 --tax-lineage \ || fail "filterdb died" fi diff --git a/data/krona_prelude.html b/data/krona_prelude.html new file mode 100644 index 0000000..b1f3084 --- /dev/null +++ b/data/krona_prelude.html @@ -0,0 +1,6619 @@ + + + + + + + + + + + + + +
+ + + magnitude + + + kt + + diff --git a/lib/ksw2/kseq.h b/lib/ksw2/kseq.h index 76c7666..47b354e 100644 --- a/lib/ksw2/kseq.h +++ b/lib/ksw2/kseq.h @@ -41,6 +41,8 @@ typedef struct __kstream_t { \ char *buf; \ int begin, end, is_eof; \ + size_t cur_buf_pos; \ + size_t newline; \ type_t f; \ } kstream_t; @@ -71,6 +73,7 @@ if (ks->is_eof && ks->begin >= ks->end) return -1; \ if (ks->begin >= ks->end) { \ ks->begin = 0; \ + ks->cur_buf_pos += ks->end; \ ks->end = __read(ks->f, ks->buf, __bufsize); \ if (ks->end == 0) { ks->is_eof = 1; return -1;} \ if (ks->end == -1) { ks->is_eof = 1; return -3;}\ @@ -102,6 +105,7 @@ typedef struct __kstring_t { if (ks->begin >= ks->end) { \ if (!ks->is_eof) { \ ks->begin = 0; \ + ks->cur_buf_pos += ks->end; \ ks->end = __read(ks->f, ks->buf, __bufsize); \ if (ks->end == 0) { ks->is_eof = 1; break; } \ if (ks->end == -1) { ks->is_eof = 1; return -3; } \ @@ -109,7 +113,7 @@ typedef struct __kstring_t { } \ if (delimiter == KS_SEP_LINE) { \ for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == '\n') break; \ + if (ks->buf[i] == '\n') { ks->newline+=(append == 1); break; } \ } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ @@ -158,6 +162,9 @@ typedef struct __kstring_t { { \ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ + s->f->end=0; \ + s->f->cur_buf_pos=0; \ + s->f->newline=0; \ return s; \ } \ SCOPE void kseq_destroy(kseq_t *ks) \ @@ -179,11 +186,15 @@ typedef struct __kstring_t { { \ int c,r; \ kstream_t *ks = seq->f; \ - if (seq->last_char == 0) { /* then jump to the next header line */ \ + ks->newline = 0; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ + seq->offset = ks->cur_buf_pos + ks->begin; \ if (c < 0) return c; /* end of file or error*/ \ seq->last_char = c; \ - } /* else: the first header char has been read in the previous call */ \ + } else{ /* else: the first header char has been read in the previous call */ \ + seq->offset = ks->cur_buf_pos + ks->begin; \ + } \ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ @@ -192,11 +203,14 @@ typedef struct __kstring_t { seq->seq.s = (char*)malloc(seq->seq.m); \ } \ while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ - if (c == '\n') continue; /* skip empty lines */ \ + if (c == '\n') { ks->newline++; continue; } /* skip empty lines */ \ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ } \ - if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + seq->multiline = (ks->newline > 1); \ + if (c == '>' || c == '@') { /* the first header char has been read */ \ + seq->last_char = c; \ + } \ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ @@ -221,6 +235,8 @@ typedef struct __kstring_t { typedef struct { \ kstring_t name, comment, seq, qual; \ int last_char; \ + size_t offset; \ + bool multiline; \ kstream_t *f; \ } kseq_t; diff --git a/lib/omptl/omptl_algorithm b/lib/omptl/omptl_algorithm index 
4750868..7058d34 100644 --- a/lib/omptl/omptl_algorithm +++ b/lib/omptl/omptl_algorithm @@ -552,10 +552,16 @@ ForwardIterator upper_bound(ForwardIterator first, ForwardIterator last, } // namespace omptl +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-parameter" +#endif #ifdef _OPENMP #include #else #include #endif - +#if defined(__clang__) +#pragma clang diagnostic pop +#endif #endif /* OMPTL_ALGORITHM */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 897adfc..c7299e9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -120,8 +120,13 @@ endif () # SIMD instruction sets support if (HAVE_AVX2) target_compile_definitions(mmseqs-framework PUBLIC -DAVX2=1) - append_target_property(mmseqs-framework COMPILE_FLAGS -mavx2 -Wa,-q) - append_target_property(mmseqs-framework LINK_FLAGS -mavx2 -Wa,-q) + if (CMAKE_COMPILER_IS_CLANG) + append_target_property(mmseqs-framework COMPILE_FLAGS -mavx2) + append_target_property(mmseqs-framework LINK_FLAGS -mavx2) + else () + append_target_property(mmseqs-framework COMPILE_FLAGS -mavx2 -Wa,-q) + append_target_property(mmseqs-framework LINK_FLAGS -mavx2 -Wa,-q) + endif () elseif (HAVE_SSE4_1) target_compile_definitions(mmseqs-framework PUBLIC -DSSE=1) append_target_property(mmseqs-framework COMPILE_FLAGS -msse4.1) @@ -226,7 +231,9 @@ if (OPENMP_FOUND) # For GCC we dont want to do this since it breaks macOS static builds # It will link libgomp.a internally (through -fopenmp I guess) # and also link libgomp.dylib thus breaking static builds - # target_link_libraries(mmseqs-framework ${OpenMP_CXX_LIBRARIES}) + if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + target_link_libraries(mmseqs-framework ${OpenMP_CXX_LIBRARIES}) + endif() append_target_property(mmseqs-framework COMPILE_FLAGS ${OpenMP_CXX_FLAGS}) append_target_property(mmseqs-framework LINK_FLAGS ${OpenMP_CXX_FLAGS}) elseif (REQUIRE_OPENMP) diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 335e445..8797c46 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -209,10 +209,10 @@ std::vector baseCommands = { CITATION_MMSEQS2, {{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb }, {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"taxDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::taxResult }}}, - {"taxonomyreport", taxonomyreport, &par.threadsandcompression, COMMAND_TAXONOMY, - "Create Kraken-style taxonomy report.", + {"taxonomyreport", taxonomyreport, &par.taxonomyreport, COMMAND_TAXONOMY, + "Create a taxonomy report in either Kraken or Krona mode.", NULL, - "Florian Breitwieser ", + "Milot Mirdita & Florian Breitwieser ", " ", CITATION_MMSEQS2, {{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::taxSequenceDb }, {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::taxResult }, diff --git a/src/alignment/Alignment.cpp b/src/alignment/Alignment.cpp index a6b9130..b49df9f 100644 --- a/src/alignment/Alignment.cpp +++ b/src/alignment/Alignment.cpp @@ -81,6 +81,21 @@ Alignment::Alignment(const std::string &querySeqDB, } initSWMode(alignmentMode); + if (par.wrappedScoring) + { + maxSeqLen = maxSeqLen * 2; + if(!Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)){ + Debug(Debug::ERROR) << "Wrapped scoring is only supported for nucleotides.\n"; + EXIT(EXIT_FAILURE); + } + + if (realign == true) { + Debug(Debug::ERROR) << "Alternative 
alignments do not supported wrapped scoring.\n"; + EXIT(EXIT_FAILURE); + } + } + + //qdbr->readMmapedDataInMemory(); // make sure to touch target after query, so if there is not enough memory for the query, at least the targets // might have had enough space left to be residung in the page cache @@ -212,7 +227,7 @@ Alignment::~Alignment() { } void Alignment::run(const unsigned int mpiRank, const unsigned int mpiNumProc, - const unsigned int maxAlnNum, const unsigned int maxRejected) { + const unsigned int maxAlnNum, const unsigned int maxRejected, bool wrappedScoring) { size_t dbFrom = 0; size_t dbSize = 0; @@ -220,7 +235,7 @@ void Alignment::run(const unsigned int mpiRank, const unsigned int mpiNumProc, Debug(Debug::INFO) << "Compute split from " << dbFrom << " to " << (dbFrom + dbSize) << "\n"; std::pair tmpOutput = Util::createTmpFileNames(outDB, outDBIndex, mpiRank); - run(tmpOutput.first, tmpOutput.second, dbFrom, dbSize, maxAlnNum, maxRejected, true); + run(tmpOutput.first, tmpOutput.second, dbFrom, dbSize, maxAlnNum, maxRejected, true, wrappedScoring); #ifdef HAVE_MPI MPI_Barrier(MPI_COMM_WORLD); @@ -237,13 +252,13 @@ void Alignment::run(const unsigned int mpiRank, const unsigned int mpiNumProc, } } -void Alignment::run(const unsigned int maxAlnNum, const unsigned int maxRejected) { - run(outDB, outDBIndex, 0, prefdbr->getSize(), maxAlnNum, maxRejected, false); +void Alignment::run(const unsigned int maxAlnNum, const unsigned int maxRejected, bool wrappedScoring) { + run(outDB, outDBIndex, 0, prefdbr->getSize(), maxAlnNum, maxRejected, false, wrappedScoring); } void Alignment::run(const std::string &outDB, const std::string &outDBIndex, const size_t dbFrom, const size_t dbSize, - const unsigned int maxAlnNum, const unsigned int maxRejected, bool merge) { + const unsigned int maxAlnNum, const unsigned int maxRejected, bool merge, bool wrappedScoring) { size_t alignmentsNum = 0; size_t totalPassedNum = 0; DBWriter dbw(outDB.c_str(), outDBIndex.c_str(), threads, compressed, Parameters::DBTYPE_ALIGNMENT_RES); @@ -281,7 +296,7 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, Sequence dbSeq(maxSeqLen, targetSeqType, m, 0, false, compBiasCorrection); Matcher matcher(querySeqType, maxSeqLen, m, &evaluer, compBiasCorrection, gapOpen, gapExtend); Matcher *realigner = NULL; - if (realign == true) { + if (realign == true && wrappedScoring == false) { realigner = new Matcher(querySeqType, maxSeqLen, realign_m, &evaluer, compBiasCorrection, gapOpen, gapExtend); } @@ -289,6 +304,8 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, swResults.reserve(300); std::vector swRealignResults; swRealignResults.reserve(300); + std::vector shortResults; + shortResults.reserve(300); #pragma omp for schedule(dynamic, 5) reduction(+: alignmentsNum, totalPassedNum) for (size_t id = start; id < (start + bucketSize); id++) { @@ -297,6 +314,8 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, // get the prefiltering list char *data = prefdbr->getData(id, thread_idx); unsigned int queryDbKey = prefdbr->getDbKey(id); + size_t queryLen = -1, origQueryLen = -1; + std::string queryToWrap; // only load query data if data != \0 if(*data != '\0'){ size_t qId = qdbr->getId(queryDbKey); @@ -306,7 +325,16 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, << " is required in the prefiltering, but is not contained in the query sequence database.\nPlease check your database.\n"; EXIT(EXIT_FAILURE); } - 
qSeq.mapSequence(qId, queryDbKey, querySeqData, qdbr->getSeqLen(qId)); + queryLen = qdbr->getSeqLen(qId); + origQueryLen = queryLen; + if (wrappedScoring) { + queryToWrap = std::string(querySeqData,queryLen); + queryToWrap = queryToWrap + queryToWrap; + querySeqData = (char*)(queryToWrap).c_str(); + queryLen = origQueryLen*2; + } + + qSeq.mapSequence(qId, queryDbKey, querySeqData, queryLen); matcher.initQuery(&qSeq); } @@ -338,7 +366,7 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, } dbSeq.mapSequence(dbId, dbKey, dbSeqData, tdbr->getSeqLen(dbId)); // check if the sequences could pass the coverage threshold - if(Util::canBeCovered(canCovThr, covMode, static_cast(qSeq.L), static_cast(dbSeq.L)) == false) { + if(Util::canBeCovered(canCovThr, covMode, static_cast(origQueryLen), static_cast(dbSeq.L)) == false) { rejected++; data = Util::skipLine(data); continue; @@ -346,7 +374,7 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, const bool isIdentity = (queryDbKey == dbKey && (includeIdentity || sameQTDB)) ? true : false; // calculate Smith-Waterman alignment - Matcher::result_t res = matcher.getSWResult(&dbSeq, static_cast(diagonal), isReverse, covMode, covThr, evalThr, swMode, seqIdMode, isIdentity); + Matcher::result_t res = matcher.getSWResult(&dbSeq, static_cast(diagonal), isReverse, covMode, covThr, evalThr, swMode, seqIdMode, isIdentity, wrappedScoring); alignmentsNum++; //set coverage and seqid if identity @@ -356,7 +384,15 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, res.seqId = 1.0f; } if(checkCriteria(res, isIdentity, evalThr, seqIdThr, alnLenThr, covMode, covThr)){ - swResults.emplace_back(res); + if(wrappedScoring){ + hit_t hit; + hit.seqId = res.dbKey; + hit.prefScore = (isReverse?-100:100) * res.seqId; + hit.diagonal = isReverse?res.qStartPos-res.dbEndPos:res.qStartPos-res.dbStartPos; + shortResults.emplace_back(hit); + } + else + swResults.emplace_back(res); passedNum++; totalPassedNum++; rejected = 0; @@ -366,12 +402,16 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, data = Util::skipLine(data); } - if(altAlignment > 0 && realign == false ){ + if(altAlignment > 0 && realign == false && wrappedScoring == false){ computeAlternativeAlignment(queryDbKey, dbSeq, swResults, matcher, evalThr, swMode, thread_idx); } + if(wrappedScoring && shortResults.size() > 1) + std::sort(shortResults.begin(), shortResults.end(), hit_t::compareHitsByScoreAndId); + // write the results - std::sort(swResults.begin(), swResults.end(), Matcher::compareHits); + if(swResults.size() > 1) + std::sort(swResults.begin(), swResults.end(), Matcher::compareHits); if (realign == true) { realigner->initQuery(&qSeq); for (size_t result = 0; result < swResults.size(); result++) { @@ -401,7 +441,7 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, } } swResults = swRealignResults; - if(altAlignment> 0 ){ + if(altAlignment > 0){ computeAlternativeAlignment(queryDbKey, dbSeq, swResults, matcher, FLT_MAX, Matcher::SCORE_COV_SEQID, thread_idx); } } @@ -411,10 +451,18 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, size_t len = Matcher::resultToBuffer(buffer, swResults[result], addBacktrace); alnResultsOutString.append(buffer, len); } + + for (size_t result = 0; result < shortResults.size(); result++) { + size_t len = snprintf(buffer, 100, "%u\t%d\t%d\n", shortResults[result].seqId, shortResults[result].prefScore, + 
shortResults[result].diagonal); + alnResultsOutString.append(buffer, len); + } + dbw.writeData(alnResultsOutString.c_str(), alnResultsOutString.length(), queryDbKey, thread_idx); alnResultsOutString.clear(); swResults.clear(); swRealignResults.clear(); + shortResults.clear(); } if (realign == true) { delete realigner; diff --git a/src/alignment/Alignment.h b/src/alignment/Alignment.h index 4583300..c597b12 100644 --- a/src/alignment/Alignment.h +++ b/src/alignment/Alignment.h @@ -24,16 +24,16 @@ class Alignment { ~Alignment(); //None MPI - void run(const unsigned int maxAlnNum, const unsigned int maxRejected); + void run(const unsigned int maxAlnNum, const unsigned int maxRejected, bool wrappedScoring=false); //MPI function void run(const unsigned int mpiRank, const unsigned int mpiNumProc, - const unsigned int maxAlnNum, const unsigned int maxRejected); + const unsigned int maxAlnNum, const unsigned int maxRejected, bool wrappedScoring=false); //Run parallel void run(const std::string &outDB, const std::string &outDBIndex, const size_t dbFrom, const size_t dbSize, - const unsigned int maxAlnNum, const unsigned int maxRejected, bool merge); + const unsigned int maxAlnNum, const unsigned int maxRejected, bool merge, bool wrappedScoring=false); static bool checkCriteria(Matcher::result_t &res, bool isIdentity, double evalThr, double seqIdThr, int alnLenThr, int covMode, float covThr); diff --git a/src/alignment/BandedNucleotideAligner.cpp b/src/alignment/BandedNucleotideAligner.cpp index fda05ef..81a0720 100644 --- a/src/alignment/BandedNucleotideAligner.cpp +++ b/src/alignment/BandedNucleotideAligner.cpp @@ -71,7 +71,7 @@ void BandedNucleotideAligner::initQuery(Sequence * query){ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj, int diagonal, bool reverse, std::string & backtrace, int & aaIds, - EvalueComputation * evaluer) + EvalueComputation * evaluer, bool wrappedScoring) { char * queryCharSeqAlign = (char*) querySeqObj->getSeqData(); uint8_t * querySeqRevAlign = querySeqRev; @@ -89,9 +89,22 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj, int qUngappedStartPos, qUngappedEndPos, dbUngappedStartPos, dbUngappedEndPos; - DistanceCalculator::LocalAlignment alignment = DistanceCalculator::computeUngappedAlignment( - queryCharSeqAlign, querySeqObj->L, targetSeqObj->getSeqData(), targetSeqObj->L, - diagonal, fastMatrix.matrix, Parameters::RESCORE_MODE_ALIGNMENT); + DistanceCalculator::LocalAlignment alignment; + int queryLen = querySeqObj->L; + int origQueryLen = queryLen; + if (wrappedScoring) { + alignment = DistanceCalculator::computeUngappedWrappedAlignment( + queryCharSeqAlign, querySeqObj->L, targetSeqObj->getSeqData(), targetSeqObj->L, + diagonal, fastMatrix.matrix, Parameters::RESCORE_MODE_ALIGNMENT); + origQueryLen = queryLen/2; + } + else { + alignment = DistanceCalculator::computeUngappedAlignment( + queryCharSeqAlign, querySeqObj->L, targetSeqObj->getSeqData(), targetSeqObj->L, + diagonal, fastMatrix.matrix, Parameters::RESCORE_MODE_ALIGNMENT); + } + + unsigned int distanceToDiagonal = alignment.distToDiagonal; diagonal = alignment.diagonal; @@ -106,13 +119,12 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj, dbUngappedStartPos = alignment.startPos + distanceToDiagonal; dbUngappedEndPos = alignment.endPos + distanceToDiagonal; } - - if(qUngappedStartPos == 0 && qUngappedEndPos == querySeqObj->L -1 + if(qUngappedEndPos-qUngappedStartPos == origQueryLen - 1 && dbUngappedStartPos == 0 && dbUngappedEndPos == targetSeqObj->L - 1){ s_align 
result; uint32_t * retCigar = new uint32_t[1]; retCigar[0] = 0; - retCigar[0] = querySeqObj->L << 4; + retCigar[0] = origQueryLen << 4; result.cigar = retCigar; result.cigarLen = 1; result.score1 = alignment.score; @@ -121,12 +133,14 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj, result.dbEndPos1 = dbUngappedEndPos; result.dbStartPos1 = dbUngappedStartPos; result.qCov = SmithWaterman::computeCov(result.qStartPos1, result.qEndPos1, querySeqObj->L); + if(wrappedScoring) + result.qCov = std::min(1.0f, result.qCov*2); result.tCov = SmithWaterman::computeCov(result.dbStartPos1, result.dbEndPos1, targetSeqObj->L); - result.evalue = evaluer->computeEvalue(result.score1, querySeqObj->L); + result.evalue = evaluer->computeEvalue(result.score1, origQueryLen); for (int i = qUngappedStartPos; i <= qUngappedEndPos; i++) { aaIds += (querySeqAlign[i] == targetSeq[dbUngappedStartPos + (i - dbUngappedStartPos)]) ? 1 : 0; } - for(int pos = 0; pos < querySeqObj->L; pos++){ + for(int pos = 0; pos < origQueryLen; pos++){ backtrace.append("M"); } return result; @@ -141,9 +155,14 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj, int flag = 0; flag |= KSW_EZ_SCORE_ONLY; flag |= KSW_EZ_EXTZ_ONLY; - ksw_extz2_sse(0, querySeqObj->L - qStartRev, querySeqRevAlign + qStartRev, targetSeqObj->L - tStartRev, targetSeqRev + tStartRev, 5, mat, gapo, gape, 64, 40, flag, &ez); - int qStartPos = querySeqObj->L - ( qStartRev + ez.max_q ) -1 ; + int queryRevLenToAlign = querySeqObj->L - qStartRev; + if (wrappedScoring && queryRevLenToAlign > origQueryLen) + queryRevLenToAlign = origQueryLen; + + ksw_extz2_sse(0, queryRevLenToAlign, querySeqRevAlign + qStartRev, targetSeqObj->L - tStartRev, targetSeqRev + tStartRev, 5, mat, gapo, gape, 64, 40, flag, &ez); + + int qStartPos = querySeqObj->L - ( qStartRev + ez.max_q ) -1; int tStartPos = targetSeqObj->L - ( tStartRev + ez.max_t ) -1; int alignFlag = 0; @@ -153,14 +172,33 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj, // ezAlign.cigar = cigar; // printf("%d %d\n", qStartPos, tStartPos); memset(&ezAlign, 0, sizeof(ksw_extz_t)); - ksw_extz2_sse(0, querySeqObj->L-qStartPos, querySeqAlign+qStartPos, targetSeqObj->L-tStartPos, targetSeq+tStartPos, 5, + + int queryLenToAlign = querySeqObj->L-qStartPos; + if (wrappedScoring && queryLenToAlign > origQueryLen) + queryLenToAlign = origQueryLen; + ksw_extz2_sse(0, queryLenToAlign, querySeqAlign+qStartPos, targetSeqObj->L-tStartPos, targetSeq+tStartPos, 5, mat, gapo, gape, 64, 40, alignFlag, &ezAlign); std::string letterCode = "MID"; - uint32_t * retCigar = new uint32_t[ezAlign.n_cigar]; - for(int i = 0; i < ezAlign.n_cigar; i++){ - retCigar[i]=ezAlign.cigar[i]; + uint32_t * retCigar; + + if (ez.max_q > ezAlign.max_q && ez.max_t > ezAlign.max_t){ + + ksw_extz2_sse(0, queryRevLenToAlign, querySeqRevAlign + qStartRev, targetSeqObj->L - tStartRev, + targetSeqRev + tStartRev, 5, mat, gapo, gape, 64, 40, alignFlag, &ezAlign); + + retCigar = new uint32_t[ezAlign.n_cigar]; + for(int i = 0; i < ezAlign.n_cigar; i++){ + retCigar[i]=ezAlign.cigar[ezAlign.n_cigar-1-i]; + } } + else { + retCigar = new uint32_t[ezAlign.n_cigar]; + for(int i = 0; i < ezAlign.n_cigar; i++){ + retCigar[i]=ezAlign.cigar[i]; + } + } + s_align result; result.cigar = retCigar; result.cigarLen = ezAlign.n_cigar; @@ -170,8 +208,10 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj, result.dbEndPos1 = tStartPos+ezAlign.max_t; result.dbStartPos1 = tStartPos; result.qCov = 
SmithWaterman::computeCov(result.qStartPos1, result.qEndPos1, querySeqObj->L); + if(wrappedScoring) + result.qCov = std::min(1.0f, result.qCov*2); result.tCov = SmithWaterman::computeCov(result.dbStartPos1, result.dbEndPos1, targetSeqObj->L); - result.evalue = evaluer->computeEvalue(result.score1, querySeqObj->L); + result.evalue = evaluer->computeEvalue(result.score1, origQueryLen); if(result.cigar){ int32_t targetPos = result.dbStartPos1, queryPos = result.qStartPos1; for (int32_t c = 0; c < result.cigarLen; ++c) { @@ -200,6 +240,7 @@ s_align BandedNucleotideAligner::align(Sequence * targetSeqObj, } } } + free(ezAlign.cigar); return result; // std::cout << static_cast(aaIds)/ static_cast(alignment.len) << std::endl; diff --git a/src/alignment/BandedNucleotideAligner.h b/src/alignment/BandedNucleotideAligner.h index 01a2231..364bbb1 100644 --- a/src/alignment/BandedNucleotideAligner.h +++ b/src/alignment/BandedNucleotideAligner.h @@ -24,7 +24,7 @@ class BandedNucleotideAligner { void initQuery(Sequence *q); s_align align(Sequence * targetSeqObj, int diagonal, bool reverse, - std::string & backtrace, int & aaIds, EvalueComputation * evaluer); + std::string & backtrace, int & aaIds, EvalueComputation * evaluer, bool wrappedScoring=false); private: SubstitutionMatrix::FastMatrix fastMatrix; diff --git a/src/alignment/DistanceCalculator.h b/src/alignment/DistanceCalculator.h index 20ea7de..5e99239 100644 --- a/src/alignment/DistanceCalculator.h +++ b/src/alignment/DistanceCalculator.h @@ -54,6 +54,42 @@ class DistanceCalculator { }; + template + static LocalAlignment computeUngappedWrappedAlignment(const T *querySeq, unsigned int querySeqLen, + const T *dbSeq, unsigned int dbSeqLen, + const unsigned short diagonal, const char **subMat, int alnMode){ + /* expect: querySeq = originQuerySeq+originQuerySeq + queryLen = len(querySeq) */ + + LocalAlignment max; + for(unsigned int devisions = 1; (-devisions * 65536 + diagonal) > -dbSeqLen; devisions++) { + + int realDiagonal = (-devisions * 65536 + diagonal) + querySeqLen/2; + LocalAlignment tmp = ungappedAlignmentByDiagonal(querySeq + realDiagonal, querySeqLen/2, dbSeq, dbSeqLen, 0, subMat, alnMode); + tmp.diagonal += realDiagonal; + tmp.distToDiagonal = abs(realDiagonal); + + if(tmp.score > max.score){ + max = tmp; + } + } + for(unsigned int devisions = 0; (devisions * 65536 + diagonal) < querySeqLen/2; devisions++) { + int realDiagonal = (devisions * 65536 + diagonal); + LocalAlignment tmp = ungappedAlignmentByDiagonal(querySeq + realDiagonal, querySeqLen/2, dbSeq, dbSeqLen, 0, subMat, alnMode); + + tmp.diagonal += realDiagonal; + tmp.distToDiagonal = abs(realDiagonal); + + if(tmp.score > max.score){ + max = tmp; + } + } + unsigned int minSeqLen = std::min(dbSeqLen, querySeqLen/2); + max.diagonalLen = minSeqLen; + + return max; + } + template static LocalAlignment computeUngappedAlignment(const T *querySeq, unsigned int querySeqLen, const T *dbSeq, unsigned int dbSeqLen, @@ -170,8 +206,10 @@ class DistanceCalculator { const unsigned int length, const char **subMat) { - unsigned int first = (seq1[0] =='*' || seq2[0] == '*')? 1:0; - unsigned int last = (seq1[length-1] =='*' || seq2[length-1] == '*')? length-2 : length-1; + unsigned int first = (seq1[0] == '*' || seq2[0] == '*')? 
1:0; + unsigned int last = length-1; + if (last > 0 && (seq1[length-1] =='*' || seq2[length-1] == '*')) + last--; int64_t score = 0; for(unsigned int pos = first; pos <= last; pos++){ int curr = subMat[static_cast(seq1[pos])][static_cast(seq2[pos])]; @@ -190,20 +228,21 @@ class DistanceCalculator { uint64_t window = 0; uint64_t windowMask = (uint64_t)1 << (windowSize-1); unsigned int currErrors = 0; - unsigned int maxLength = 0; + unsigned int maxLength = 0; unsigned int currLength = 0; unsigned int maxEndPos = 0; unsigned int maxStartPos = 0; int maxScore = 0; unsigned int first = (seq1[0] =='*' || seq2[0] == '*')? 1:0; - unsigned int last = (seq1[length-1] =='*' || seq2[length-1] == '*')? length-2 : length-1; - + unsigned int last = length-1; + if (last > 0 && (seq1[length-1] =='*' || seq2[length-1] == '*')) + last--; unsigned int startPos = first; for(unsigned int pos = first; pos <= last; pos++){ bool currMatch = seq1[pos]==seq2[pos]; if(window & windowMask){ - currErrors -=1; + currErrors -= 1; } window = window << (uint64_t) 1; @@ -256,6 +295,7 @@ class DistanceCalculator { } + /* * Adapted from levenshtein.js (https://gist.github.com/andrei-m/982927) * Changed to hold only one row of the dynamic programing matrix diff --git a/src/alignment/Main.cpp b/src/alignment/Main.cpp index 31281f9..7a3094a 100644 --- a/src/alignment/Main.cpp +++ b/src/alignment/Main.cpp @@ -22,9 +22,9 @@ int align(int argc, const char **argv, const Command& command) { Debug(Debug::INFO) << "Calculation of alignments\n"; #ifdef HAVE_MPI - aln.run(MMseqsMPI::rank, MMseqsMPI::numProc, par.maxAccept, par.maxRejected); + aln.run(MMseqsMPI::rank, MMseqsMPI::numProc, par.maxAccept, par.maxRejected, par.wrappedScoring); #else - aln.run(par.maxAccept, par.maxRejected); + aln.run(par.maxAccept, par.maxRejected, par.wrappedScoring); #endif return EXIT_SUCCESS; diff --git a/src/alignment/Matcher.cpp b/src/alignment/Matcher.cpp index 5d06da7..59677df 100644 --- a/src/alignment/Matcher.cpp +++ b/src/alignment/Matcher.cpp @@ -59,9 +59,11 @@ void Matcher::initQuery(Sequence* query){ Matcher::result_t Matcher::getSWResult(Sequence* dbSeq, const int diagonal, bool isReverse, const int covMode, const float covThr, - const double evalThr, unsigned int alignmentMode, unsigned int seqIdMode, bool isIdentity){ + const double evalThr, unsigned int alignmentMode, unsigned int seqIdMode, bool isIdentity, + bool wrappedScoring){ // calculation of the score and traceback of the alignment int32_t maskLen = currentQuery->L / 2; + int origQueryLen = wrappedScoring? 
currentQuery->L / 2 : currentQuery->L ; // calcuate stop score // const double qL = static_cast(currentQuery->L); @@ -83,7 +85,7 @@ Matcher::result_t Matcher::getSWResult(Sequence* dbSeq, const int diagonal, bool << "Please check your database.\n"; EXIT(EXIT_FAILURE); } - alignment = nuclaligner->align(dbSeq, diagonal, isReverse, backtrace, aaIds, evaluer); + alignment = nuclaligner->align(dbSeq, diagonal, isReverse, backtrace, aaIds, evaluer, wrappedScoring); alignmentMode = Matcher::SCORE_COV_SEQID; }else{ if(isIdentity==false){ alignment = aligner->ssw_align(dbSeq->int_sequence, dbSeq->L, gapOpen, gapExtend, alignmentMode, evalThr, evaluer, covMode, covThr, maskLen); @@ -121,7 +123,7 @@ Matcher::result_t Matcher::getSWResult(Sequence* dbSeq, const int diagonal, bool } } } else { - for (int32_t c = 0; c < currentQuery->L; ++c) { + for (int32_t c = 0; c < origQueryLen; ++c) { aaIds++; backtrace.append("M"); } @@ -154,7 +156,7 @@ Matcher::result_t Matcher::getSWResult(Sequence* dbSeq, const int diagonal, bool // OVERWRITE alnLength with gapped value alnLength = backtrace.size(); } - seqId = Util::computeSeqId(seqIdMode, aaIds, currentQuery->L, dbSeq->L, alnLength); + seqId = Util::computeSeqId(seqIdMode, aaIds, origQueryLen, dbSeq->L, alnLength); }else if( alignmentMode == Matcher::SCORE_COV){ // "20% 30% 40% 50% 60% 70% 80% 90% 99%" @@ -176,9 +178,9 @@ Matcher::result_t Matcher::getSWResult(Sequence* dbSeq, const int diagonal, bool result_t result; if(isReverse){ - result = result_t(dbSeq->getDbKey(), bitScore, qcov, dbcov, seqId, evalue, alnLength, qStartPos, qEndPos, currentQuery->L, dbEndPos, dbStartPos, dbSeq->L, backtrace); + result = result_t(dbSeq->getDbKey(), bitScore, qcov, dbcov, seqId, evalue, alnLength, qStartPos, qEndPos, origQueryLen, dbEndPos, dbStartPos, dbSeq->L, backtrace); }else{ - result = result_t(dbSeq->getDbKey(), bitScore, qcov, dbcov, seqId, evalue, alnLength, qStartPos, qEndPos, currentQuery->L, dbStartPos, dbEndPos, dbSeq->L, backtrace); + result = result_t(dbSeq->getDbKey(), bitScore, qcov, dbcov, seqId, evalue, alnLength, qStartPos, qEndPos, origQueryLen, dbStartPos, dbEndPos, dbSeq->L, backtrace); } diff --git a/src/alignment/Matcher.h b/src/alignment/Matcher.h index 45e1947..fbeb2ca 100644 --- a/src/alignment/Matcher.h +++ b/src/alignment/Matcher.h @@ -128,7 +128,7 @@ class Matcher{ // run SSE2 parallelized Smith-Waterman alignment calculation and traceback result_t getSWResult(Sequence* dbSeq, const int diagonal, bool isReverse, const int covMode, const float covThr, const double evalThr, - unsigned int alignmentMode, unsigned int seqIdMode, bool isIdentical); + unsigned int alignmentMode, unsigned int seqIdMode, bool isIdentical, bool wrappedScoring=false); // need for sorting the results static bool compareHits (const result_t &first, const result_t &second){ diff --git a/src/alignment/MultipleAlignment.h b/src/alignment/MultipleAlignment.h index 7ce9781..9c7d3ca 100644 --- a/src/alignment/MultipleAlignment.h +++ b/src/alignment/MultipleAlignment.h @@ -10,10 +10,6 @@ #include "Matcher.h" -class SubstitutionMatrix; -class Sequence; - - class MultipleAlignment { public: enum alignment_element { diff --git a/src/alignment/StripedSmithWaterman.h b/src/alignment/StripedSmithWaterman.h index 99c866c..79b3984 100644 --- a/src/alignment/StripedSmithWaterman.h +++ b/src/alignment/StripedSmithWaterman.h @@ -46,7 +46,6 @@ #include "Sequence.h" #include "EvalueComputation.h" - typedef struct { short qStartPos; short dbStartPos; @@ -56,8 +55,8 @@ typedef struct { 
typedef struct { - uint16_t score1; - uint16_t score2; + uint32_t score1; + uint32_t score2; int32_t dbStartPos1; int32_t dbEndPos1; int32_t qStartPos1; diff --git a/src/alignment/rescorediagonal.cpp b/src/alignment/rescorediagonal.cpp index 4b3960c..de9f326 100644 --- a/src/alignment/rescorediagonal.cpp +++ b/src/alignment/rescorediagonal.cpp @@ -66,6 +66,14 @@ int doRescorediagonal(Parameters &par, qdbr = qDbrIdx->sequenceReader; querySeqType = qdbr->getDbtype(); } + if (par.wrappedScoring) + { + if(!Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)){ + Debug(Debug::ERROR) << "Wrapped scoring is only supported for nucleotides.\n"; + EXIT(EXIT_FAILURE); + } + } + if(resultReader.isSortedByOffset() && qdbr->isSortedByOffset()){ qdbr->setSequentialAdvice(); @@ -137,12 +145,22 @@ int doRescorediagonal(Parameters &par, size_t queryKey = resultReader.getDbKey(id); char *querySeq = NULL; + std::string queryToWrap; // needed only for wrapped end-start scoring unsigned int queryId = UINT_MAX; - int queryLen = -1; + int queryLen = -1, origQueryLen = -1; if(*data != '\0'){ queryId = qdbr->getId(queryKey); querySeq = qdbr->getData(queryId, thread_idx); queryLen = static_cast(qdbr->getSeqLen(queryId)); + origQueryLen = queryLen; + + if (par.wrappedScoring){ + queryToWrap = std::string(querySeq,queryLen); + queryToWrap = queryToWrap + queryToWrap; + querySeq = (char*)(queryToWrap).c_str(); + queryLen = origQueryLen*2; + } + if(queryLen > queryRevSeqLen){ delete [] queryRevSeq; queryRevSeq = new char[queryLen]; @@ -161,7 +179,6 @@ int doRescorediagonal(Parameters &par, querySeq = (char *) queryBuffer.c_str(); } } - // if(par.rescoreMode != Parameters::RESCORE_MODE_HAMMING){ // query.mapSequence(id, queryId, querySeq); // queryLen = query.L; @@ -185,14 +202,28 @@ int doRescorediagonal(Parameters &par, char *targetSeq = tdbr->getData(targetId, thread_idx); int dbLen = static_cast(tdbr->getSeqLen(targetId)); - float queryLength = static_cast(queryLen); + float queryLength = static_cast(origQueryLen); float targetLength = static_cast(dbLen); if (Util::canBeCovered(par.covThr, par.covMode, queryLength, targetLength) == false) { continue; } - DistanceCalculator::LocalAlignment alignment = DistanceCalculator::computeUngappedAlignment( - querySeqToAlign, queryLen, targetSeq, targetLength, - results[entryIdx].diagonal, fastMatrix.matrix, par.rescoreMode); + DistanceCalculator::LocalAlignment alignment; + if (par.wrappedScoring) { + if (dbLen > origQueryLen) { + Debug(Debug::WARNING) << "WARNING: target sequence " << targetId + << " is skipped, no valid wrapped scoring possible\n"; + continue; + } + + alignment = DistanceCalculator::computeUngappedWrappedAlignment( + querySeqToAlign, queryLen, targetSeq, targetLength, + results[entryIdx].diagonal, fastMatrix.matrix, par.rescoreMode); + } + else { + alignment = DistanceCalculator::computeUngappedAlignment( + querySeqToAlign, queryLen, targetSeq, targetLength, + results[entryIdx].diagonal, fastMatrix.matrix, par.rescoreMode); + } unsigned int distanceToDiagonal = alignment.distToDiagonal; int diagonalLen = alignment.diagonalLen; int distance = alignment.score; @@ -202,18 +233,18 @@ int doRescorediagonal(Parameters &par, int bitScore = 0; int alnLen = 0; float targetCov = static_cast(diagonalLen) / static_cast(dbLen); - float queryCov = static_cast(diagonalLen) / static_cast(queryLen); + float queryCov = static_cast(diagonalLen) / static_cast(origQueryLen); Matcher::result_t result; if (par.rescoreMode == Parameters::RESCORE_MODE_HAMMING) { 
int idCnt = (static_cast(distance)); - seqId = Util::computeSeqId(par.seqIdMode, idCnt, queryLen, dbLen, diagonalLen); + seqId = Util::computeSeqId(par.seqIdMode, idCnt, origQueryLen, dbLen, diagonalLen); alnLen = diagonalLen; } else if (par.rescoreMode == Parameters::RESCORE_MODE_SUBSTITUTION || par.rescoreMode == Parameters::RESCORE_MODE_ALIGNMENT || par.rescoreMode == Parameters::RESCORE_MODE_GLOBAL_ALIGNMENT || par.rescoreMode == Parameters::RESCORE_MODE_WINDOW_QUALITY_ALIGNMENT) { - evalue = evaluer.computeEvalue(distance, queryLen); + evalue = evaluer.computeEvalue(distance, origQueryLen); bitScore = static_cast(evaluer.computeBitScore(distance) + 0.5); if (par.rescoreMode == Parameters::RESCORE_MODE_ALIGNMENT|| @@ -245,7 +276,7 @@ int doRescorediagonal(Parameters &par, char tLetter = targetSeq[dbStartPos + (i - qStartPos)] & static_cast(~0x20); idCnt += (qLetter == tLetter) ? 1 : 0; } - seqId = Util::computeSeqId(par.seqIdMode, idCnt, queryLen, dbLen, alnLen); + seqId = Util::computeSeqId(par.seqIdMode, idCnt, origQueryLen, dbLen, alnLen); } char *end = Itoa::i32toa_sse2(alnLen, buffer); size_t len = end - buffer; @@ -254,13 +285,14 @@ int doRescorediagonal(Parameters &par, backtrace=std::string(buffer, len - 1); backtrace.push_back('M'); } - queryCov = SmithWaterman::computeCov(qStartPos, qEndPos, queryLen); + queryCov = SmithWaterman::computeCov(qStartPos, qEndPos, origQueryLen); targetCov = SmithWaterman::computeCov(dbStartPos, dbEndPos, dbLen); if (isReverse) { qStartPos = queryLen - qStartPos - 1; qEndPos = queryLen - qEndPos - 1; } - result = Matcher::result_t(results[entryIdx].seqId, bitScore, queryCov, targetCov, seqId, evalue, alnLen, qStartPos, qEndPos, queryLen, dbStartPos, dbEndPos, dbLen, backtrace); + result = Matcher::result_t(results[entryIdx].seqId, bitScore, queryCov, targetCov, seqId, evalue, alnLen, + qStartPos, qEndPos, origQueryLen, dbStartPos, dbEndPos, dbLen, backtrace); } } @@ -347,6 +379,11 @@ int rescorediagonal(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); + if (par.wrappedScoring && par.rescoreMode != Parameters::RESCORE_MODE_HAMMING) { + Debug(Debug::ERROR) << "ERROR: wrapped scoring is only allowed with RESCORE_MODE_HAMMING\n"; + return EXIT_FAILURE; + } + DBReader resultReader(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); resultReader.open(DBReader::LINEAR_ACCCESS); diff --git a/src/clustering/CMakeLists.txt b/src/clustering/CMakeLists.txt index 85c4988..f39a2f4 100644 --- a/src/clustering/CMakeLists.txt +++ b/src/clustering/CMakeLists.txt @@ -3,7 +3,6 @@ set(clustering_header_files clustering/Clustering.h clustering/ClusteringAlgorithms.h clustering/Main.cpp - clustering/SetElement.h PARENT_SCOPE ) diff --git a/src/clustering/Clustering.h b/src/clustering/Clustering.h index e40ab3b..1822694 100644 --- a/src/clustering/Clustering.h +++ b/src/clustering/Clustering.h @@ -7,7 +7,6 @@ #include "DBReader.h" #include "DBWriter.h" -#include "SetElement.h" class Clustering { public: diff --git a/src/clustering/ClusteringAlgorithms.h b/src/clustering/ClusteringAlgorithms.h index d9dab7f..85b7b10 100644 --- a/src/clustering/ClusteringAlgorithms.h +++ b/src/clustering/ClusteringAlgorithms.h @@ -11,7 +11,6 @@ #include #include "DBReader.h" -#include "SetElement.h" class ClusteringAlgorithms { public: diff --git a/src/clustering/SetElement.h b/src/clustering/SetElement.h deleted file mode 100644 index 
0fb77f6..0000000 --- a/src/clustering/SetElement.h +++ /dev/null @@ -1,27 +0,0 @@ -// -// element.h -// graphcluster -// -// Created by Martin Steinegger on 21.05.13. -// Copyright (c) 2013 Martin Steinegger. All rights reserved. -// - -#ifndef GRAPHCLUSTER_ELEMENT -#define GRAPHCLUSTER_ELEMENT - -struct set { - struct element { - element * last; - element * next; - unsigned short weight; - - }; - element * elements; - set * last; - set * next; - unsigned short weight; -}; - - - -#endif diff --git a/src/commons/Command.h b/src/commons/Command.h index d198d9a..b02a565 100644 --- a/src/commons/Command.h +++ b/src/commons/Command.h @@ -3,12 +3,16 @@ #include -const int CITATION_MMSEQS2 = 1 << 0; -const int CITATION_MMSEQS1 = 1 << 1; -const int CITATION_UNICLUST = 1 << 2; -const int CITATION_LINCLUST = 1 << 3; -const int CITATION_PLASS = 1 << 4; -const int CITATION_SERVER = 1 << 5; +const unsigned int CITATION_MMSEQS2 = 1 << 0; +const unsigned int CITATION_MMSEQS1 = 1 << 1; +const unsigned int CITATION_UNICLUST = 1 << 2; +const unsigned int CITATION_LINCLUST = 1 << 3; +const unsigned int CITATION_PLASS = 1 << 4; +const unsigned int CITATION_SERVER = 1 << 5; + +// Make sure this is always the last bit +// citations from inheriting tools will start from here +const unsigned int CITATION_END = CITATION_SERVER << 1; struct MMseqsParameter; @@ -77,7 +81,7 @@ struct Command { const char *longDescription; const char *author; const char *usage; - int citations; + unsigned int citations; std::vector databases; }; diff --git a/src/commons/DBReader.cpp b/src/commons/DBReader.cpp index 4ba1717..01fbf54 100644 --- a/src/commons/DBReader.cpp +++ b/src/commons/DBReader.cpp @@ -134,7 +134,7 @@ template bool DBReader::open(int accessType){ if (dataMode & USE_LOOKUP || dataMode & USE_LOOKUP_REV) { std::string lookupFilename = (std::string(dataFileName) + ".lookup"); if(FileUtil::fileExists(lookupFilename.c_str()) == false){ - Debug(Debug::ERROR) << "Can not open index file " << lookupFilename << "!\n"; + Debug(Debug::ERROR) << "Can not open lookup file " << lookupFilename << "!\n"; EXIT(EXIT_FAILURE); } MemoryMapped indexData(lookupFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); @@ -158,7 +158,7 @@ template bool DBReader::open(int accessType){ } MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan); if (!indexData.isValid()){ - Debug(Debug::ERROR) << "Can not open index file " << indexFileName << "\n"; + Debug(Debug::ERROR) << "Can map open index file " << indexFileName << "\n"; EXIT(EXIT_FAILURE); } char* indexDataChar = (char *) indexData.getData(); @@ -478,7 +478,7 @@ template size_t DBReader::bsearch(const Index * index, size_t N, { Index val; val.id = value; - return std::upper_bound(index, index + N, val, Index::compareById) - index; + return std::upper_bound(index, index + N, val, Index::compareByIdOnly) - index; } template char* DBReader::getDataCompressed(size_t id, int thrIdx) { @@ -614,7 +614,7 @@ template size_t DBReader::getLookupIdByKey(T dbKey) { } LookupEntry val; val.id = dbKey; - size_t id = std::upper_bound(lookup, lookup + lookupSize, val, LookupEntry::compareById) - lookup; + size_t id = std::upper_bound(lookup, lookup + lookupSize, val, LookupEntry::compareByIdOnly) - lookup; return (id < lookupSize && lookup[id].id == dbKey) ? 
id : SIZE_MAX; } @@ -908,7 +908,7 @@ template size_t DBReader::getOffset(size_t id) { if (id >= size){ Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << indexFileName << "\n"; - Debug(Debug::ERROR) << "getDbKey: local id (" << id << ") >= db size (" << size << ")\n"; + Debug(Debug::ERROR) << "getOffset: local id (" << id << ") >= db size (" << size << ")\n"; EXIT(EXIT_FAILURE); } if (local2id != NULL) { diff --git a/src/commons/DBReader.h b/src/commons/DBReader.h index 471c78f..a10b173 100644 --- a/src/commons/DBReader.h +++ b/src/commons/DBReader.h @@ -57,23 +57,73 @@ class DBReader { T id; size_t offset; unsigned int length; - static bool compareById(const Index& x, const Index& y){ - return (x.id <= y.id); + + // we need a non-strict-weak ordering function here + // so our upper_bound call works correctly + static bool compareByIdOnly(const Index &x, const Index &y) { + return x.id <= y.id; } - static bool compareByOffset(const Index& x, const Index& y){ - return (x.offset <= y.offset); + + static bool compareById(const Index &x, const Index &y) { + if (x.id < y.id) + return true; + if (y.id < x.id) + return false; + if (x.offset < y.offset) + return true; + if (y.offset < x.offset) + return false; + if (x.length < y.length) + return true; + if (y.length < x.length) + return false; + return false; } - }; + static bool compareByOffset(const Index &x, const Index &y) { + if (x.offset < y.offset) + return true; + if (y.offset < x.offset) + return false; + if (x.id < y.id) + return true; + if (y.id < x.id) + return false; + if (x.length < y.length) + return true; + if (y.length < x.length) + return false; + return false; + } + }; struct LookupEntry { T id; std::string entryName; unsigned int fileNumber; - static bool compareById(const LookupEntry& x, const LookupEntry& y){ - return (x.id <= y.id); + // we need a non-strict-weak ordering function here + // so our upper_bound call works correctly + static bool compareByIdOnly(const LookupEntry& x, const LookupEntry& y) { + return x.id <= y.id; } + + static bool compareById(const LookupEntry& x, const LookupEntry& y) { + if (x.id < y.id) + return true; + if (y.id < x.id) + return false; + if (x.entryName < y.entryName) + return true; + if (y.entryName < x.entryName) + return false; + if (x.fileNumber < y.fileNumber) + return true; + if (y.fileNumber < x.fileNumber) + return false; + return false; + } + static bool compareByAccession(const LookupEntry& x, const LookupEntry& y){ return x.entryName.compare(y.entryName); } @@ -100,6 +150,7 @@ class DBReader { size_t getAminoAcidDBSize(); size_t getDataSize() { return dataSize; } + void setDataSize(size_t newSize) { dataSize = newSize; } char* getData(size_t id, int thrIdx); @@ -115,6 +166,8 @@ class DBReader { size_t getSize(); + unsigned int getMaxSeqLen(){return maxSeqLen;} + T getDbKey(size_t id); @@ -144,7 +197,7 @@ class DBReader { size_t getEntryLen(size_t id){ if (id >= size){ Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << indexFileName << "\n"; - Debug(Debug::ERROR) << "getSeqLen: local id (" << id << ") >= db size (" << size << ")\n"; + Debug(Debug::ERROR) << "getEntryLen: local id (" << id << ") >= db size (" << size << ")\n"; EXIT(EXIT_FAILURE); } if (local2id != NULL) { diff --git a/src/commons/DBWriter.cpp b/src/commons/DBWriter.cpp index 0ff113a..cb19474 100644 --- a/src/commons/DBWriter.cpp +++ b/src/commons/DBWriter.cpp @@ -93,8 +93,6 @@ DBWriter::~DBWriter() { } void 
DBWriter::sortDatafileByIdOrder(DBReader &dbr) { - Debug(Debug::INFO) << "Sorting the results... " << dataFileName << " .. "; - #pragma omp parallel { int thread_idx = 0; @@ -609,7 +607,7 @@ void DBWriter::mergeResults(const char *outFileName, const char *outFileNameInde DBWriter::sortIndex(indexFileNames[0], outFileNameIndex, lexicographicOrder); FileUtil::remove(indexFileNames[0]); - Debug(Debug::INFO) << "Time for merging into " << outFileName << " by mergeResults: " << timer.lap() << "\n"; + Debug(Debug::INFO) << "Time for merging to " << FileUtil::baseName(outFileName) << ": " << timer.lap() << "\n"; } void DBWriter::mergeIndex(const char** indexFilenames, unsigned int fileCount, const std::vector &dataSizes) { diff --git a/src/commons/DBWriter.h b/src/commons/DBWriter.h index 09cf4c8..575faa6 100644 --- a/src/commons/DBWriter.h +++ b/src/commons/DBWriter.h @@ -69,6 +69,9 @@ class DBWriter { static void createRenumberedDB(const std::string& dataFile, const std::string& indexFile, const std::string& lookupFile, int sortMode = DBReader::SORT_BY_ID_OFFSET); + bool isClosed(){ + return closed; + } private: size_t addToThreadBuffer(const void *data, size_t itmesize, size_t nitems, int threadIdx); void writeThreadBuffer(unsigned int idx, size_t dataSize); diff --git a/src/commons/FileUtil.cpp b/src/commons/FileUtil.cpp index ceafbea..a83abb7 100644 --- a/src/commons/FileUtil.cpp +++ b/src/commons/FileUtil.cpp @@ -139,6 +139,19 @@ size_t FileUtil::getFreeSpace(const char *path) { return stat.f_bfree * stat.f_frsize; } + +std::string FileUtil::getRealPathFromSymLink(const std::string path){ + char *p = realpath(path.c_str(), NULL); + if (p == NULL) { + Debug(Debug::ERROR) << "Could not get path of " << path << "!\n"; + EXIT(EXIT_FAILURE); + } + + std::string name(p); + free(p); + return name; +} + std::string FileUtil::getHashFromSymLink(const std::string path){ char *p = realpath(path.c_str(), NULL); if (p == NULL) { diff --git a/src/commons/FileUtil.h b/src/commons/FileUtil.h index e5abfd7..8237ebf 100644 --- a/src/commons/FileUtil.h +++ b/src/commons/FileUtil.h @@ -24,6 +24,8 @@ class FileUtil { static void deleteTempFiles(const std::list &tmpFiles); + static std::string getRealPathFromSymLink(const std::string path); + static std::string getHashFromSymLink(const std::string path); static void* mmapFile(FILE * file, size_t *dataSize); diff --git a/src/commons/KSeqWrapper.cpp b/src/commons/KSeqWrapper.cpp index bb4e596..4407175 100644 --- a/src/commons/KSeqWrapper.cpp +++ b/src/commons/KSeqWrapper.cpp @@ -19,7 +19,8 @@ bool KSeqFile::ReadEntry() { int result = KSEQFILE::kseq_read(s); if (result < 0) return false; - + entry.offset = s->offset; + entry.multiline = s->multiline; entry.name = s->name; entry.comment = s->comment; entry.sequence = s->seq; @@ -63,6 +64,8 @@ bool KSeqGzip::ReadEntry() { entry.comment = s->comment; entry.sequence = s->seq; entry.qual = s->qual; + entry.offset = -1; + entry.multiline = s->multiline; return true; } @@ -105,6 +108,8 @@ bool KSeqBzip::ReadEntry() { entry.comment = s->comment; entry.sequence = s->seq; entry.qual = s->qual; + entry.offset = -1; + entry.multiline = s->multiline; return true; } diff --git a/src/commons/KSeqWrapper.h b/src/commons/KSeqWrapper.h index b358a4a..7a56525 100644 --- a/src/commons/KSeqWrapper.h +++ b/src/commons/KSeqWrapper.h @@ -12,6 +12,8 @@ class KSeqWrapper { kstring_t sequence; kstring_t comment; kstring_t qual; + size_t offset; + bool multiline; } entry; virtual bool ReadEntry() = 0; diff --git a/src/commons/Parameters.cpp 
b/src/commons/Parameters.cpp index 31ec2a4..2f1d0d0 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -5,13 +5,12 @@ #include "CommandCaller.h" #include "ByteParser.h" #include "FileUtil.h" -#include +#include #include #include #include - #ifdef __CYGWIN__ #include #endif @@ -88,10 +87,11 @@ Parameters::Parameters(): PARAM_PROFILE_TYPE(PARAM_PROFILE_TYPE_ID,"--profile-type", "Profile type", "0: HMM (HHsuite) 1: PSSM or 2: HMMER3",typeid(int),(void *) &profileMode, "^[0-2]{1}$"), // convertalignments PARAM_FORMAT_MODE(PARAM_FORMAT_MODE_ID,"--format-mode", "Alignment format", "Output format 0: BLAST-TAB, 1: SAM, 2: BLAST-TAB + query/db length", typeid(int), (void*) &formatAlignmentMode, "^[0-2]{1}$"), - PARAM_FORMAT_OUTPUT(PARAM_FORMAT_OUTPUT_ID,"--format-output", "Format alignment output", "Choose output columns 'query,target,evalue,gapopen,pident,nident,qstart,qend,qlen,tstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,qframe,tframe,mismatch,qcov,tcov,qset,qsetid,tset,tsetid'", typeid(std::string), (void*) &outfmt, ""), + PARAM_FORMAT_OUTPUT(PARAM_FORMAT_OUTPUT_ID,"--format-output", "Format alignment output", "Choose output columns 'query,target,evalue,gapopen,pident,nident,qstart,qend,qlen,tstart,tend,tlen,alnlen,raw,bits,cigar,qseq,tseq,qheader,theader,qaln,taln,qframe,tframe,mismatch,qcov,tcov,qset,qsetid,tset,tsetid,taxid,taxname,taxlineage'", typeid(std::string), (void*) &outfmt, ""), PARAM_DB_OUTPUT(PARAM_DB_OUTPUT_ID, "--db-output", "Database output", "Output a result db instead of a text file", typeid(bool), (void*) &dbOut, "", MMseqsParameter::COMMAND_EXPERT), // --include-only-extendablediagonal PARAM_RESCORE_MODE(PARAM_RESCORE_MODE_ID,"--rescore-mode", "Rescore mode", "Rescore diagonal with: 0: Hamming distance, 1: local alignment (score only), 2: local alignment, 3: global alignment or 4: longest alignment fullfilling window quality criterion", typeid(int), (void *) &rescoreMode, "^[0-4]{1}$"), + PARAM_WRAPPED_SCORING(PARAM_WRAPPED_SCORING_ID,"--wrapped-scoring", "Allow wrapped scoring","Double the (nucleotide) query sequence during the scoring process to allow wrapped diagonal scoring around end and start", typeid(bool), (void *) &wrappedScoring, "", MMseqsParameter::COMMAND_MISC|MMseqsParameter::COMMAND_EXPERT), PARAM_FILTER_HITS(PARAM_FILTER_HITS_ID,"--filter-hits", "Remove hits by seq. id. and coverage", "filter hits by seq.id. and coverage", typeid(bool), (void *) &filterHits, "", MMseqsParameter::COMMAND_EXPERT), PARAM_SORT_RESULTS(PARAM_SORT_RESULTS_ID, "--sort-results", "Sort results", "Sort results: 0: no sorting, 1: sort by evalue (Alignment) or seq.id. 
(Hamming)", typeid(int), (void *) &sortResults, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), // result2msa @@ -134,7 +134,7 @@ Parameters::Parameters(): PARAM_KMER_PER_SEQ(PARAM_KMER_PER_SEQ_ID, "--kmer-per-seq", "K-mers per sequence", "kmer per sequence", typeid(int), (void*) &kmersPerSequence, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR), PARAM_KMER_PER_SEQ_SCALE(PARAM_KMER_PER_SEQ_SCALE_ID, "--kmer-per-seq-scale", "scale k-mers per sequence", "scale kmer per sequence based on sequence length as kmer-per-seq val + scale x seqlen", typeid(float), (void*) &kmersPerSequenceScale, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_EXPERT), PARAM_INCLUDE_ONLY_EXTENDABLE(PARAM_INCLUDE_ONLY_EXTENDABLE_ID, "--include-only-extendable", "Include only extendable", "Include only extendable", typeid(bool), (void*) &includeOnlyExtendable, "", MMseqsParameter::COMMAND_CLUSTLINEAR), - PARAM_SKIP_N_REPEAT_KMER(PARAM_SKIP_N_REPEAT_KMER_ID, "--skip-n-repeat-kmer", "Skip sequence with n repeating k-mers", "Skip sequence with >= n exact repeating k-mers", typeid(int), (void*) &skipNRepeatKmer, "^[0-9]{1}[0-9]*", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT), + PARAM_IGNORE_MULTI_KMER(PARAM_IGNORE_MULTI_KMER_ID, "--ignore-multi-kmer", "Skip repeating k-mers", "Skip kmers occuring multiple times (>=2)", typeid(bool), (void*) &ignoreMultiKmer, "", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT), PARAM_HASH_SHIFT(PARAM_HASH_SHIFT_ID, "--hash-shift", "Shift hash", "Shift k-mer hash", typeid(int), (void*) &hashShift, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT), PARAM_PICK_N_SIMILAR(PARAM_HASH_SHIFT_ID, "--pick-n-sim-kmer", "Add N similar to search", "adds N similar to search", typeid(int), (void*) &pickNbest, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT), PARAM_ADJUST_KMER_LEN(PARAM_ADJUST_KMER_LEN_ID, "--adjust-kmer-len", "Adjust k-mer length", "adjust k-mer length based on specificity (only for nucleotides)", typeid(bool), (void*) &adjustKmerLength, "", MMseqsParameter::COMMAND_CLUSTLINEAR|MMseqsParameter::COMMAND_EXPERT), @@ -169,12 +169,12 @@ Parameters::Parameters(): PARAM_USE_HEADER(PARAM_USE_HEADER_ID,"--use-fasta-header", "Use fasta header", "use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers",typeid(bool),(void *) &useHeader, ""), PARAM_ID_OFFSET(PARAM_ID_OFFSET_ID, "--id-offset", "Offset of numeric ids", "numeric ids in index file are offset by this value ",typeid(int),(void *) &identifierOffset, "^(0|[1-9]{1}[0-9]*)$"), PARAM_DB_TYPE(PARAM_DB_TYPE_ID,"--dbtype", "Database type", "Database type 0: auto, 1: amino acid 2: nucleotides",typeid(int),(void *) &dbType, "[0-2]{1}"), - PARAM_DONT_SPLIT_SEQ_BY_LEN(PARAM_DONT_SPLIT_SEQ_BY_LEN_ID,"--dont-split-seq-by-len", "Split seq. 
by length", "Dont split sequences by --max-seq-len",typeid(bool),(void *) &splitSeqByLen, ""), - PARAM_DONT_SHUFFLE(PARAM_DONT_SHUFFLE_ID,"--dont-shuffle", "Do not shuffle input database", "Do not shuffle input database",typeid(bool),(void *) &shuffleDatabase, ""), + PARAM_CREATEDB_MODE(PARAM_CREATEDB_MODE_ID, "--createdb-mode", "Createdb mode", "createdb mode 0: copy data, 1: soft link data and write new index (works only with single line fasta/q)",typeid(int),(void *) &createdbMode, "^[0-1]{1}$"), + PARAM_SHUFFLE(PARAM_SHUFFLE_ID,"--shuffle", "Shuffle input database", "Shuffle input database",typeid(bool),(void *) &shuffleDatabase, ""), PARAM_USE_HEADER_FILE(PARAM_USE_HEADER_FILE_ID, "--use-header-file", "Use ffindex header", "use the ffindex header file instead of the body to map the entry keys",typeid(bool),(void *) &useHeaderFile, ""), // splitsequence PARAM_SEQUENCE_OVERLAP(PARAM_SEQUENCE_OVERLAP_ID, "--sequence-overlap", "Overlap between sequences", "overlap between sequences",typeid(int),(void *) &sequenceOverlap, "^(0|[1-9]{1}[0-9]*)$"), - PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "sequence split mode 0: soft link data write new index, 1: copy data",typeid(int),(void *) &sequenceSplitMode, "^[0-1]{1}$"), + PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "sequence split mode 0: copy data, 1: soft link data and write new index,",typeid(int),(void *) &sequenceSplitMode, "^[0-1]{1}$"), // gff2db PARAM_GFF_TYPE(PARAM_GFF_TYPE_ID,"--gff-type", "GFF type", "type in the GFF file to filter by",typeid(std::string),(void *) &gffType, ""), // translatenucs @@ -243,6 +243,8 @@ Parameters::Parameters(): PARAM_LCA_RANKS(PARAM_LCA_RANKS_ID, "--lca-ranks", "LCA ranks", "Add column with specified ranks (':' separated)", typeid(std::string), (void*) &lcaRanks, ""), PARAM_BLACKLIST(PARAM_BLACKLIST_ID, "--blacklist", "Blacklisted taxa", "Comma separated list of ignored taxa in LCA computation", typeid(std::string), (void*)&blacklist, "([0-9]+,)?[0-9]+"), PARAM_TAXON_ADD_LINEAGE(PARAM_TAXON_ADD_LINEAGE_ID, "--tax-lineage", "Show taxon lineage", "Add column with full taxonomy lineage", typeid(bool), (void*)&showTaxLineage, ""), + // taxonomyreport + PARAM_REPORT_MODE(PARAM_REPORT_MODE_ID,"--report-mode", "Report mode", "Taxonomy report mode 0: Kraken 1: Krona", typeid(int), (void *) &reportMode, "^[0-1]{1}$"), // createtaxcb PARAM_NCBI_TAX_DUMP(PARAM_NCBI_TAX_DUMP_ID, "--ncbi-tax-dump", "NCBI tax dump directory", "NCBI tax dump directory. The tax dump can be downloaded here \"ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz\"", typeid(std::string), (void*) &ncbiTaxDump, ""), PARAM_TAX_MAPPING_FILE(PARAM_TAX_MAPPING_FILE_ID, "--tax-mapping-file", "Taxonomical mapping file", "File to map sequence identifer to taxonomical identifier", typeid(std::string), (void*) &taxMappingFile, ""), @@ -252,7 +254,7 @@ Parameters::Parameters(): PARAM_LCA_MODE(PARAM_LCA_MODE_ID, "--lca-mode", "LCA mode", "LCA Mode 1: Single Search LCA , 2: 2bLCA, 3: approx. 
2bLCA, 4: top hit", typeid(int), (void*) &taxonomySearchMode, "^[1-4]{1}$"), PARAM_TAX_OUTPUT_MODE(PARAM_TAX_OUTPUT_MODE_ID, "--tax-output-mode", "Taxonomy output mode", "0: output LCA, 1: output alignment", typeid(int), (void*) &taxonomyOutpuMode, "^[0-1]{1}$"), // createsubdb - PARAM_SUBDB_MODE(PARAM_SUBDB_MODE_ID, "--subdb-mode", "Subdb mode", "LCA Mode 0: copy data 1: soft link data", typeid(int), (void*) &subDbMode, "^[0-1]{1}$") + PARAM_SUBDB_MODE(PARAM_SUBDB_MODE_ID, "--subdb-mode", "Subdb mode", "Subdb mode 0: copy data 1: soft link data and write index", typeid(int), (void*) &subDbMode, "^[0-1]{1}$") { if (instance) { Debug(Debug::ERROR) << "Parameter instance already exists!\n"; @@ -280,6 +282,7 @@ Parameters::Parameters(): align.push_back(&PARAM_SUB_MAT); align.push_back(&PARAM_ADD_BACKTRACE); align.push_back(&PARAM_ALIGNMENT_MODE); + align.push_back(&PARAM_WRAPPED_SCORING); align.push_back(&PARAM_E); align.push_back(&PARAM_MIN_SEQ_ID); align.push_back(&PARAM_MIN_ALN_LEN); @@ -356,6 +359,7 @@ Parameters::Parameters(): // rescorediagonal rescorediagonal.push_back(&PARAM_SUB_MAT); rescorediagonal.push_back(&PARAM_RESCORE_MODE); + rescorediagonal.push_back(&PARAM_WRAPPED_SCORING); rescorediagonal.push_back(&PARAM_FILTER_HITS); rescorediagonal.push_back(&PARAM_E); rescorediagonal.push_back(&PARAM_C); @@ -418,6 +422,7 @@ Parameters::Parameters(): result2profile.push_back(&PARAM_E_PROFILE); result2profile.push_back(&PARAM_NO_COMP_BIAS_CORR); result2profile.push_back(&PARAM_WG); + result2profile.push_back(&PARAM_ALLOW_DELETION); result2profile.push_back(&PARAM_FILTER_MSA); result2profile.push_back(&PARAM_FILTER_MAX_SEQ_ID); result2profile.push_back(&PARAM_FILTER_QID); @@ -639,7 +644,7 @@ Parameters::Parameters(): kmerindexdb.push_back(&PARAM_MIN_SEQ_ID); kmerindexdb.push_back(&PARAM_ADJUST_KMER_LEN); kmerindexdb.push_back(&PARAM_SPLIT_MEMORY_LIMIT); - kmerindexdb.push_back(&PARAM_SKIP_N_REPEAT_KMER); + kmerindexdb.push_back(&PARAM_IGNORE_MULTI_KMER); kmerindexdb.push_back(&PARAM_ALPH_SIZE); kmerindexdb.push_back(&PARAM_MAX_SEQ_LEN); kmerindexdb.push_back(&PARAM_MASK_RESIDUES); @@ -653,9 +658,9 @@ Parameters::Parameters(): // create db createdb.push_back(&PARAM_MAX_SEQ_LEN); - createdb.push_back(&PARAM_DONT_SPLIT_SEQ_BY_LEN); createdb.push_back(&PARAM_DB_TYPE); - createdb.push_back(&PARAM_DONT_SHUFFLE); + createdb.push_back(&PARAM_SHUFFLE); + createdb.push_back(&PARAM_CREATEDB_MODE); createdb.push_back(&PARAM_ID_OFFSET); createdb.push_back(&PARAM_COMPRESSED); createdb.push_back(&PARAM_V); @@ -791,7 +796,7 @@ Parameters::Parameters(): kmermatcher.push_back(&PARAM_HASH_SHIFT); kmermatcher.push_back(&PARAM_SPLIT_MEMORY_LIMIT); kmermatcher.push_back(&PARAM_INCLUDE_ONLY_EXTENDABLE); - kmermatcher.push_back(&PARAM_SKIP_N_REPEAT_KMER); + kmermatcher.push_back(&PARAM_IGNORE_MULTI_KMER); kmermatcher.push_back(&PARAM_THREADS); kmermatcher.push_back(&PARAM_COMPRESSED); kmermatcher.push_back(&PARAM_V); @@ -918,6 +923,11 @@ Parameters::Parameters(): addtaxonomy.push_back(&PARAM_THREADS); addtaxonomy.push_back(&PARAM_V); + // taxonomyreport + taxonomyreport.push_back(&PARAM_REPORT_MODE); + taxonomyreport.push_back(&PARAM_THREADS); + taxonomyreport.push_back(&PARAM_V); + // view view.push_back(&PARAM_ID_LIST); view.push_back(&PARAM_IDX_ENTRY_TYPE); @@ -1032,6 +1042,7 @@ Parameters::Parameters(): easytaxonomy = combineList(taxonomy, addtaxonomy); easytaxonomy = combineList(easytaxonomy, convertalignments); easytaxonomy = combineList(easytaxonomy, createtsv); + easytaxonomy = combineList(easytaxonomy, 
createdb); // multi hit db multihitdb = combineList(createdb, extractorfs); @@ -1187,26 +1198,13 @@ void Parameters::printUsageMessage(const Command& command, } if (printExpert == false) { ss << "\n" << "An extended list of options can be obtained by calling '" << binary_name << " " << command.cmd << " -h'.\n"; - if(command.citations > 0) { - //ss << "Please cite:\n"; - if(command.citations & CITATION_SERVER) { - ss << " - Mirdita M, Steinegger M, Soding J: MMseqs2 desktop and local web server app for fast, interactive sequence searches. Bioinformatics, doi: 10.1093/bioinformatics/bty1057 (2019).\n"; - } - if(command.citations & CITATION_PLASS) { - ss << " - Steinegger M, Mirdita M, Soding J: Protein-level assembly increases protein sequence recovery from metagenomic samples manyfold. biorxiv, doi:10.1101/386110 (2018)\n"; - } - if(command.citations & CITATION_LINCLUST) { - ss << " - Steinegger M, Soding J: Clustering huge protein sequence sets in linear time. Nature Communications, doi:10.1038/s41467-018-04964-5 (2018)\n"; - } - if(command.citations & CITATION_MMSEQS1) { - ss << " - Hauser M, Steinegger M, Soding J: MMseqs software suite for fast and deep clustering and searching of large protein sequence sets. Bioinformatics, 32(9), 1323-1330 (2016). \n"; - } - if(command.citations & CITATION_UNICLUST) { - ss << " - Mirdita M, von den Driesch L, Galiez C, Martin M, Soding J, Steinegger M: Uniclust databases of clustered and deeply annotated protein sequences and alignments. Nucleic Acids Res (2017), D170-D176 (2016).\n"; - } - if(command.citations & CITATION_MMSEQS2) { - ss << " - Steinegger M, Soding J: MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nature Biotechnology, doi:10.1038/nbt.3988 (2017)\n"; + ss << "\nReferences:\n"; + for (unsigned int pos = 0 ; pos != sizeof(command.citations) * CHAR_BIT; ++pos) { + unsigned int citation = 1 << pos; + if (command.citations & citation && citations.find(citation) != citations.end()) { + ss << " - " << citations.at(citation) << "\n"; + } } } } @@ -1546,6 +1544,29 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c } } +void Parameters::checkIfTaxDbIsComplete(std::string & filename){ + if (FileUtil::fileExists((filename + "_mapping").c_str()) == false) { + Debug(Debug::ERROR) << "Database " << filename << " need taxonomical information.\n" + << "The " << filename << "_mapping is missing.\n"; + EXIT(EXIT_FAILURE); + } + if (FileUtil::fileExists((filename + "_nodes.dmp").c_str()) == false) { + Debug(Debug::ERROR) << "Database " << filename << " need taxonomical information.\n" + << "The " << filename << "_nodes.dmp is missing.\n"; + EXIT(EXIT_FAILURE); + } + if (FileUtil::fileExists((filename + "_names.dmp").c_str()) == false) { + Debug(Debug::ERROR) << "Database " << filename << " need taxonomical information.\n" + << "The " << filename << "_names.dmp is missing.\n"; + EXIT(EXIT_FAILURE); + } + if (FileUtil::fileExists((filename + "_merged.dmp").c_str()) == false) { + Debug(Debug::ERROR) << "Database " << filename << " need taxonomical information.\n" + << "The " << filename << "_merged.dmp is missing.\n"; + EXIT(EXIT_FAILURE); + } +} + void Parameters::checkIfDatabaseIsValid(const Command& command, bool isStartVar, bool isEndVar) { size_t fileIdx = 0; for (size_t dbIdx = 0; dbIdx < command.databases.size(); dbIdx++) { @@ -1582,26 +1603,7 @@ void Parameters::checkIfDatabaseIsValid(const Command& command, bool isStartVar, } } if (db.specialType & DbType::NEED_TAXONOMY) 
{ - if (FileUtil::fileExists((filenames[fileIdx] + "_mapping").c_str()) == false) { - Debug(Debug::ERROR) << "Database " << filenames[fileIdx] << " need taxonomical information.\n" - << "The " << filenames[fileIdx] << "_mapping is missing.\n"; - EXIT(EXIT_FAILURE); - } - if (FileUtil::fileExists((filenames[fileIdx] + "_nodes.dmp").c_str()) == false) { - Debug(Debug::ERROR) << "Database " << filenames[fileIdx] << " need taxonomical information.\n" - << "The " << filenames[fileIdx] << "_nodes.dmp is missing.\n"; - EXIT(EXIT_FAILURE); - } - if (FileUtil::fileExists((filenames[fileIdx] + "_names.dmp").c_str()) == false) { - Debug(Debug::ERROR) << "Database " << filenames[fileIdx] << " need taxonomical information.\n" - << "The " << filenames[fileIdx] << "_names.dmp is missing.\n"; - EXIT(EXIT_FAILURE); - } - if (FileUtil::fileExists((filenames[fileIdx] + "_merged.dmp").c_str()) == false) { - Debug(Debug::ERROR) << "Database " << filenames[fileIdx] << " need taxonomical information.\n" - << "The " << filenames[fileIdx] << "_merged.dmp is missing.\n"; - EXIT(EXIT_FAILURE); - } + checkIfTaxDbIsComplete(filenames[fileIdx]); } if (db.specialType & DbType::NEED_LOOKUP) { if (FileUtil::fileExists((filenames[fileIdx] + ".lookup").c_str()) == false) { @@ -1817,7 +1819,7 @@ void Parameters::setDefaults() { searchType = SEARCH_TYPE_AUTO; // createdb - splitSeqByLen = true; + createdbMode = SEQUENCE_SPLIT_MODE_HARD; shuffleDatabase = true; // format alignment @@ -1900,6 +1902,7 @@ void Parameters::setDefaults() { // rescorediagonal rescoreMode = Parameters::RESCORE_MODE_HAMMING; + wrappedScoring = false; filterHits = false; sortResults = false; @@ -1962,7 +1965,7 @@ void Parameters::setDefaults() { kmersPerSequence = 21; kmersPerSequenceScale = 0.0; includeOnlyExtendable = false; - skipNRepeatKmer = 0; + ignoreMultiKmer = false; hashShift = 5; pickNbest = 1; adjustKmerLength = false; @@ -2000,12 +2003,24 @@ void Parameters::setDefaults() { // https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=28384 blacklist = "12908,28384"; + // taxonomyreport + reportMode = 0; + // expandaln expansionMode = 1; // taxonomy taxonomySearchMode = Parameters::TAXONOMY_TOP_HIT; taxonomyOutpuMode = Parameters::TAXONOMY_OUTPUT_LCA; + + citations = { + { CITATION_MMSEQS1, "Hauser M, Steinegger M, Soding J: MMseqs software suite for fast and deep clustering and searching of large protein sequence sets. Bioinformatics, 32(9), 1323-1330 (2016)" }, + { CITATION_MMSEQS2, "Steinegger M, Soding J: MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nature Biotechnology, 35(11), 1026-1028 (2017)" }, + { CITATION_UNICLUST, "Mirdita M, von den Driesch L, Galiez C, Martin M, Soding J, Steinegger M: Uniclust databases of clustered and deeply annotated protein sequences and alignments. Nucleic Acids Research 45(D1), D170-D176 (2017)" }, + { CITATION_LINCLUST, "Steinegger M, Soding J: Clustering huge protein sequence sets in linear time. Nature Communications, 9(1), 2542 (2018)" }, + { CITATION_PLASS, "Steinegger M, Mirdita M, Soding J: Protein-level assembly increases protein sequence recovery from metagenomic samples manyfold. Nature Methods, 16(7), 603-606 (2019)" }, + { CITATION_SERVER, "Mirdita M, Steinegger M, Soding J: MMseqs2 desktop and local web server app for fast, interactive sequence searches. 
Bioinformatics, 35(16), 2856–2858 (2019)" }, + }; } std::vector Parameters::combineList(const std::vector &par1, @@ -2122,7 +2137,8 @@ void Parameters::overrideParameterDescription(Command &command, const int uid, } -std::vector Parameters::getOutputFormat(const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource) { +std::vector Parameters::getOutputFormat(const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, + bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy) { std::vector outformatSplit = Util::split(outformat, ","); std::vector formatCodes; int code = 0; @@ -2158,6 +2174,9 @@ std::vector Parameters::getOutputFormat(const std::string &outformat, bool else if (outformatSplit[i].compare("qsetid") == 0){ needLookup = true; needSource = true; code = Parameters::OUTFMT_QSETID;} else if (outformatSplit[i].compare("tset") == 0){ needLookup = true; code = Parameters::OUTFMT_TSET;} else if (outformatSplit[i].compare("tsetid") == 0){ needLookup = true; needSource = true; code = Parameters::OUTFMT_TSETID;} + else if (outformatSplit[i].compare("taxid") == 0){ needTaxonomyMapping = true; code = Parameters::OUTFMT_TAXID;} + else if (outformatSplit[i].compare("taxname") == 0){ needTaxonomyMapping = true; needTaxonomy = true; code = Parameters::OUTFMT_TAXNAME;} + else if (outformatSplit[i].compare("taxlineage") == 0){ needTaxonomyMapping = true; needTaxonomy = true; code = Parameters::OUTFMT_TAXLIN;} else if (outformatSplit[i].compare("empty") == 0){ code = Parameters::OUTFMT_EMPTY;} else { Debug(Debug::ERROR) << "Format code " << outformatSplit[i] << " does not exist."; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 2f950b2..e86bd77 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -6,6 +6,7 @@ #define MMSEQS_PARAMETERS #include #include +#include #include #include #include @@ -135,8 +136,13 @@ class Parameters { static const int OUTFMT_QSETID = 29; static const int OUTFMT_TSET = 30; static const int OUTFMT_TSETID = 31; + static const int OUTFMT_TAXID = 32; + static const int OUTFMT_TAXNAME = 33; + static const int OUTFMT_TAXLIN = 34; - static std::vector getOutputFormat(const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, bool &needLookup, bool &needSource); + + static std::vector getOutputFormat(const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders, + bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy); // convertprofiledb static const int PROFILE_MODE_HMM = 0; @@ -214,8 +220,8 @@ class Parameters { static const int SEQ_ID_LONG = 2; // seq. 
split mode - static const int SEQUENCE_SPLIT_MODE_SOFT = 0; - static const int SEQUENCE_SPLIT_MODE_HARD = 1; + static const int SEQUENCE_SPLIT_MODE_HARD = 0; + static const int SEQUENCE_SPLIT_MODE_SOFT = 1; // rescorediagonal static const int RESCORE_MODE_HAMMING = 0; @@ -396,6 +402,7 @@ class Parameters { // rescorediagonal int rescoreMode; + bool wrappedScoring; bool filterHits; bool globalAlignment; int sortResults; @@ -446,7 +453,7 @@ class Parameters { int kmersPerSequence; float kmersPerSequenceScale; bool includeOnlyExtendable; - int skipNRepeatKmer; + bool ignoreMultiKmer; int hashShift; int pickNbest; int adjustKmerLength; @@ -458,12 +465,13 @@ class Parameters { // createdb int identifierOffset; int dbType; - bool splitSeqByLen; + int createdbMode; bool shuffleDatabase; // splitsequence int sequenceOverlap; int sequenceSplitMode; + // convert2fasta bool useHeaderFile; @@ -556,6 +564,9 @@ class Parameters { bool showTaxLineage; std::string blacklist; + // taxonomyreport + int reportMode; + // createtaxdb std::string ncbiTaxDump; std::string taxMappingFile; @@ -570,6 +581,9 @@ class Parameters { // createsubdb int subDbMode; + // tool citations + std::map citations; + static Parameters& getInstance() { if (instance == NULL) { @@ -666,6 +680,7 @@ class Parameters { // rescoremode PARAMETER(PARAM_RESCORE_MODE) + PARAMETER(PARAM_WRAPPED_SCORING) PARAMETER(PARAM_FILTER_HITS) PARAMETER(PARAM_SORT_RESULTS) @@ -715,7 +730,7 @@ class Parameters { PARAMETER(PARAM_KMER_PER_SEQ) PARAMETER(PARAM_KMER_PER_SEQ_SCALE) PARAMETER(PARAM_INCLUDE_ONLY_EXTENDABLE) - PARAMETER(PARAM_SKIP_N_REPEAT_KMER) + PARAMETER(PARAM_IGNORE_MULTI_KMER) PARAMETER(PARAM_HASH_SHIFT) PARAMETER(PARAM_PICK_N_SIMILAR) PARAMETER(PARAM_ADJUST_KMER_LEN) @@ -756,8 +771,8 @@ class Parameters { PARAMETER(PARAM_USE_HEADER) // also used by extractorfs PARAMETER(PARAM_ID_OFFSET) // same PARAMETER(PARAM_DB_TYPE) - PARAMETER(PARAM_DONT_SPLIT_SEQ_BY_LEN) - PARAMETER(PARAM_DONT_SHUFFLE) + PARAMETER(PARAM_CREATEDB_MODE) + PARAMETER(PARAM_SHUFFLE) // convert2fasta PARAMETER(PARAM_USE_HEADER_FILE) @@ -854,6 +869,10 @@ class Parameters { PARAMETER(PARAM_LCA_RANKS) PARAMETER(PARAM_BLACKLIST) PARAMETER(PARAM_TAXON_ADD_LINEAGE) + + // taxonomyreport + PARAMETER(PARAM_REPORT_MODE) + // createtaxdb PARAMETER(PARAM_NCBI_TAX_DUMP) PARAMETER(PARAM_TAX_MAPPING_FILE) @@ -939,6 +958,7 @@ class Parameters { std::vector tsv2db; std::vector lca; std::vector addtaxonomy; + std::vector taxonomyreport; std::vector filtertaxdb; std::vector taxonomy; std::vector easytaxonomy; @@ -964,6 +984,8 @@ class Parameters { void overrideParameterDescription(Command& command, int uid, const char* description, const char* regex = NULL, int category = 0); + static void checkIfTaxDbIsComplete(std::string & filename); + static bool isEqualDbtype(const int type1, const int type2) { return ((type1 & 0x3FFFFFFF) == (type2 & 0x3FFFFFFF)); } diff --git a/src/commons/PatternCompiler.h b/src/commons/PatternCompiler.h index b9e28ec..1b7dba4 100644 --- a/src/commons/PatternCompiler.h +++ b/src/commons/PatternCompiler.h @@ -23,6 +23,11 @@ class PatternCompiler { return regexec(®ex, target, 0, NULL, 0) == 0; } + bool isMatch(const char *target, size_t nmatch, regmatch_t *pmatch) { + return regexec(®ex, target, nmatch, pmatch, 0) == 0; + } + + private: regex_t regex; }; diff --git a/src/commons/Sequence.cpp b/src/commons/Sequence.cpp index 88064a8..91de4d0 100644 --- a/src/commons/Sequence.cpp +++ b/src/commons/Sequence.cpp @@ -494,7 +494,6 @@ void Sequence::mapSequence(const char * 
sequence, unsigned int dataLen){ this->L = l; } - void Sequence::printPSSM(){ printf("Query profile of sequence %d\n", dbKey); printf("Pos "); diff --git a/src/commons/SubstitutionMatrix.cpp b/src/commons/SubstitutionMatrix.cpp index b52a13e..d6d35ae 100644 --- a/src/commons/SubstitutionMatrix.cpp +++ b/src/commons/SubstitutionMatrix.cpp @@ -6,6 +6,7 @@ #include "blosum62.out.h" #include "PAM30.out.h" #include "VTML80.out.h" +#include "VTML40.out.h" #include "nucleotide.out.h" @@ -26,6 +27,9 @@ SubstitutionMatrix::SubstitutionMatrix(const char *filename, float bitFactor, fl } else if (strcmp(parsedMatrix.first.c_str(), "VTML80.out") == 0) { matrixData = std::string((const char *)VTML80_out, VTML80_out_len); matrixName = "VTML80.out"; + } else if (strcmp(parsedMatrix.first.c_str(), "VTML40.out") == 0) { + matrixData = std::string((const char *)VTML40_out, VTML40_out_len); + matrixName = "VTML40.out"; } else if (strcmp(parsedMatrix.first.c_str(), "PAM30.out") == 0) { matrixData = std::string((const char *)PAM30_out, PAM30_out_len); matrixName = "PAM30.out"; diff --git a/src/commons/Util.cpp b/src/commons/Util.cpp index 79b3129..8000fab 100644 --- a/src/commons/Util.cpp +++ b/src/commons/Util.cpp @@ -16,9 +16,9 @@ #include "simd.h" #include "MemoryMapped.h" - #include #include +#include // std::ifstream #ifdef OPENMP #include @@ -244,7 +244,9 @@ std::pair Util::getFastaHeaderPosition(const std::string& heade } -std::string Util::parseFastaHeader(const std::string& header) { +std::string Util::parseFastaHeader(const char * headerPtr) { + size_t len = Util::skipNoneWhitespace(headerPtr); + std::string header(headerPtr, len); std::pair pos = Util::getFastaHeaderPosition(header); if(pos.first == -1 && pos.second == -1) return ""; @@ -462,6 +464,35 @@ int Util::omp_thread_count() { return n; } +std::map Util::readLookup(const std::string& file, const bool removeSplit) { + std::map mapping; + if (file.length() > 0) { + std::ifstream mappingStream(file); + if (mappingStream.fail()) { + Debug(Debug::ERROR) << "File " << file << " not found!\n"; + EXIT(EXIT_FAILURE); + } + + std::string line; + while (std::getline(mappingStream, line)) { + std::vector split = Util::split(line, "\t"); + unsigned int id = strtoul(split[0].c_str(), NULL, 10); + + std::string& name = split[1]; + + size_t pos; + if (removeSplit && (pos = name.find_last_of('_')) != std::string::npos) { + name = name.substr(0, pos); + } + + mapping.emplace(id, name); + } + } + + return mapping; +} + + std::string Util::removeWhiteSpace(std::string in) { in.erase(std::remove_if(in.begin(), in.end(), isspace), in.end()); return in; @@ -589,26 +620,6 @@ uint64_t Util::revComplement(const uint64_t kmer, const int k) { } - - -float Util::averageValueOnAminoAcids(const std::unordered_map &values, const char *seq) { - const char *seqPointer = seq; - float ret = values.at('0') + values.at('1'); // C ter and N ter values - std::unordered_map::const_iterator k; - - while (*seqPointer != '\0' && *seqPointer != '\n') { - if ((k = values.find(tolower(*seqPointer))) != values.end()) { - ret += k->second; - } - - seqPointer++; - } - - size_t seqLen = seqPointer - seq; - return ret / std::max(static_cast(1), seqLen); -} - - template<> std::string SSTR(char x) { return std::string(1, x); } template<> std::string SSTR(const std::string &x) { return x; } template<> std::string SSTR(const char* x) { return x; } diff --git a/src/commons/Util.h b/src/commons/Util.h index a777230..e4c51a7 100644 --- a/src/commons/Util.h +++ b/src/commons/Util.h @@ -5,14 +5,10 @@ 
#include #include #include -#include -#include -#include #include - +#include #include "MMseqsMPI.h" - #ifndef EXIT #define EXIT(exitCode) do { int __status = (exitCode); std::cerr.flush(); std::cout.flush(); exit(__status); } while(0) #endif @@ -21,8 +17,6 @@ #define BIT_CLEAR(a,b) ((a) & ~(1ULL<<(b))) #define BIT_CHECK(a,b) (!!((a) & (1ULL<<(b)))) - - template struct assert_false : std::false_type { }; @@ -246,8 +240,8 @@ class Util { } - static std::pair getFastaHeaderPosition(const std::string& header); - static std::string parseFastaHeader(const std::string& header); + static std::pair getFastaHeaderPosition(const std::string & header); + static std::string parseFastaHeader(const char * header); static inline char toUpper(char character){ character += ('a' <= character && character <= 'z') ? ('A' - 'a') : 0; @@ -316,6 +310,9 @@ class Util { static std::string removeWhiteSpace(std::string in); + static std::map readLookup(const std::string& lookupFile, + const bool removeSplit = false); + static bool canBeCovered(const float covThr, const int covMode, float queryLength, float targetLength); static bool hasCoverage(float covThr, int covMode, float queryCov, float targetCov); @@ -326,8 +323,6 @@ class Util { static uint64_t revComplement(const uint64_t kmer, const int k); - static float averageValueOnAminoAcids(const std::unordered_map &values, const char *seq); - static bool hasAlignmentLength(int alnLenThr, int alnLen) { return alnLen >= alnLenThr; } diff --git a/src/linclust/LinsearchIndexReader.cpp b/src/linclust/LinsearchIndexReader.cpp index 86adda2..a3eaf9c 100644 --- a/src/linclust/LinsearchIndexReader.cpp +++ b/src/linclust/LinsearchIndexReader.cpp @@ -17,7 +17,7 @@ template -size_t LinsearchIndexReader::pickCenterKmer(KmerPosition *hashSeqPair, size_t splitKmerCount) { +size_t LinsearchIndexReader::pickCenterKmer(KmerPosition *hashSeqPair, size_t splitKmerCount) { size_t writePos = 0; size_t prevHash = hashSeqPair[0].kmer; if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { @@ -61,8 +61,8 @@ size_t LinsearchIndexReader::pickCenterKmer(KmerPosition *hashSeqPair, size_t sp return writePos; } -template size_t LinsearchIndexReader::pickCenterKmer<0>(KmerPosition *hashSeqPair, size_t splitKmerCount); -template size_t LinsearchIndexReader::pickCenterKmer<1>(KmerPosition *hashSeqPair, size_t splitKmerCount); +template size_t LinsearchIndexReader::pickCenterKmer<0>(KmerPosition *hashSeqPair, size_t splitKmerCount); +template size_t LinsearchIndexReader::pickCenterKmer<1>(KmerPosition *hashSeqPair, size_t splitKmerCount); template void LinsearchIndexReader::mergeAndWriteIndex(DBWriter & dbw, std::vector tmpFiles, int alphSize, int kmerSize) { @@ -72,7 +72,7 @@ void LinsearchIndexReader::mergeAndWriteIndex(DBWriter & dbw, std::vector **entries = new KmerPosition*[fileCnt]; size_t * entrySizes = new size_t[fileCnt]; size_t * offsetPos = new size_t[fileCnt]; size_t * dataSizes = new size_t[fileCnt]; @@ -80,9 +80,9 @@ void LinsearchIndexReader::mergeAndWriteIndex(DBWriter & dbw, std::vector*)FileUtil::mmapFile(files[file], &dataSize); dataSizes[file] = dataSize; - entrySizes[file] = dataSize/sizeof(KmerPosition); + entrySizes[file] = dataSize/sizeof(KmerPosition); offsetPos[file] = 0; } std::priority_queue, CompareRepSequenceAndIdAndDiag> queue; @@ -90,7 +90,7 @@ void LinsearchIndexReader::mergeAndWriteIndex(DBWriter & dbw, std::vector currKmerPosition = entries[file][offset]; size_t currKmer = currKmerPosition.kmer; bool isReverse = false; if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ @@ -174,7 
+174,7 @@ template void LinsearchIndexReader::mergeAndWriteIndex<1>(DBWriter & dbw, std::v template void LinsearchIndexReader::writeIndex(DBWriter & dbw, - KmerPosition *hashSeqPair, size_t totalKmers, + KmerPosition *hashSeqPair, size_t totalKmers, int alphSize, int kmerSize) { KmerIndex kmerIndex(alphSize - 1, kmerSize); @@ -221,10 +221,10 @@ void LinsearchIndexReader::writeIndex(DBWriter & dbw, } template void LinsearchIndexReader::writeIndex<0>(DBWriter & dbw, - KmerPosition *hashSeqPair, size_t totalKmers, + KmerPosition *hashSeqPair, size_t totalKmers, int alphSize, int kmerSize); template void LinsearchIndexReader::writeIndex<1>(DBWriter & dbw, - KmerPosition *hashSeqPair, size_t totalKmers, + KmerPosition *hashSeqPair, size_t totalKmers, int alphSize, int kmerSize); std::string LinsearchIndexReader::indexName(std::string baseName) { @@ -241,10 +241,10 @@ bool LinsearchIndexReader::checkIfIndexFile(DBReader *pReader) { return (strncmp(version, PrefilteringIndexReader::CURRENT_VERSION, strlen(PrefilteringIndexReader::CURRENT_VERSION)) == 0 ) ? true : false; } -void LinsearchIndexReader::writeKmerIndexToDisk(std::string fileName, KmerPosition *kmers, size_t kmerCnt){ +void LinsearchIndexReader::writeKmerIndexToDisk(std::string fileName, KmerPosition *kmers, size_t kmerCnt){ FILE* filePtr = fopen(fileName.c_str(), "wb"); if(filePtr == NULL) { perror(fileName.c_str()); EXIT(EXIT_FAILURE); } - fwrite(kmers, sizeof(KmerPosition), kmerCnt, filePtr); + fwrite(kmers, sizeof(KmerPosition), kmerCnt, filePtr); fclose(filePtr); } diff --git a/src/linclust/LinsearchIndexReader.h b/src/linclust/LinsearchIndexReader.h index 16d11fb..13516ee 100644 --- a/src/linclust/LinsearchIndexReader.h +++ b/src/linclust/LinsearchIndexReader.h @@ -42,19 +42,19 @@ class LinsearchIndexReader { template - static size_t pickCenterKmer(KmerPosition *kmers, size_t splitKmerCount); + static size_t pickCenterKmer(KmerPosition *kmers, size_t splitKmerCount); template static void mergeAndWriteIndex(DBWriter &dbw, std::vector tmpFiles, int alphSize, int kmerSize); template static void writeIndex(DBWriter &dbw, - KmerPosition *hashSeqPair, size_t totalKmers, + KmerPosition *hashSeqPair, size_t totalKmers, int alphSize, int kmerSize); static std::string indexName(std::string baseName); - static void writeKmerIndexToDisk(std::string fileName, KmerPosition *kmers, size_t kmerCnt); + static void writeKmerIndexToDisk(std::string fileName, KmerPosition *kmers, size_t kmerCnt); static bool checkIfIndexFile(DBReader *pReader); diff --git a/src/linclust/kmerindexdb.cpp b/src/linclust/kmerindexdb.cpp index a459071..5165e0d 100644 --- a/src/linclust/kmerindexdb.cpp +++ b/src/linclust/kmerindexdb.cpp @@ -78,7 +78,7 @@ int kmerindexdb(int argc, const char **argv, const Command &command) { } Debug(Debug::INFO) << "\n"; size_t totalKmers = computeKmerCount(seqDbr, KMER_SIZE, chooseTopKmer); - size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); + size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); Debug(Debug::INFO) << "Estimated memory consumption " << totalSizeNeeded/1024/1024 << " MB\n"; // compute splits size_t splits = static_cast(std::ceil(static_cast(totalSizeNeeded) / memoryLimit)); @@ -90,7 +90,7 @@ int kmerindexdb(int argc, const char **argv, const Command &command) { Debug(Debug::INFO) << "Process file into " << splits << " parts\n"; std::vector splitFiles; - KmerPosition *hashSeqPair = NULL; + KmerPosition *hashSeqPair = NULL; size_t writePos = 0; size_t mpiRank = 0; diff --git 
a/src/linclust/kmermatcher.cpp b/src/linclust/kmermatcher.cpp index c0a2523..d9a40f0 100644 --- a/src/linclust/kmermatcher.cpp +++ b/src/linclust/kmermatcher.cpp @@ -60,25 +60,25 @@ unsigned circ_hash_next(const int * x, unsigned length, int x_first, short unsig #undef RoL - -KmerPosition *initKmerPositionMemory(size_t size) { - KmerPosition * hashSeqPair = new(std::nothrow) KmerPosition[size + 1]; +template +KmerPosition *initKmerPositionMemory(size_t size) { + KmerPosition * hashSeqPair = new(std::nothrow) KmerPosition[size + 1]; Util::checkAllocation(hashSeqPair, "Can not allocate memory"); - size_t pageSize = Util::getPageSize()/sizeof(KmerPosition); + size_t pageSize = Util::getPageSize()/sizeof(KmerPosition); #pragma omp parallel { #pragma omp for schedule(dynamic, 1) for (size_t page = 0; page < size+1; page += pageSize) { size_t readUntil = std::min(size+1, page + pageSize) - page; - memset(hashSeqPair+page, 0xFF, sizeof(KmerPosition)* readUntil); + memset(hashSeqPair+page, 0xFF, sizeof(KmerPosition)* readUntil); } } return hashSeqPair; } -template -std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBReader &seqDbr, +template +std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, const size_t KMER_SIZE, size_t chooseTopKmer, bool includeIdenticalKmer, size_t splits, @@ -155,7 +155,7 @@ std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBRe char * charSequence = new char[par.maxSeqLen + 1]; const unsigned int BUFFER_SIZE = 1024; size_t bufferPos = 0; - KmerPosition * threadKmerBuffer = new KmerPosition[BUFFER_SIZE]; + KmerPosition * threadKmerBuffer = new KmerPosition[BUFFER_SIZE]; SequencePosition * kmers = new SequencePosition[(pickNBest * (par.maxSeqLen + 1)) + 1]; int highestSeq[32]; for(size_t i = 0; i< KMER_SIZE; i++){ @@ -168,7 +168,7 @@ std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBRe size_t start = (i * flushSize); size_t bucketSize = std::min(seqDbr.getSize() - (i * flushSize), flushSize); -#pragma omp for schedule(dynamic, 100) +#pragma omp for schedule(dynamic, 10) for (size_t id = start; id < (start + bucketSize); id++) { progress.updateProgress(); seq.mapSequence(id, seqDbr.getDbKey(id), seqDbr.getData(id, thread_idx), seqDbr.getSeqLen(id)); @@ -307,25 +307,6 @@ std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBRe std::sort(kmers, kmers + seqKmerCount, SequencePosition::compareByScore); } } - size_t kmerConsidered = std::min(static_cast(chooseTopKmer - 1 + (chooseTopKmerScale * seq.L)), seqKmerCount); - if(par.skipNRepeatKmer > 0 ){ - size_t prevKmer = SIZE_T_MAX; - kmers[seqKmerCount].kmer=SIZE_T_MAX; - int repeatKmerCnt = 0; - for (int topKmer = 0; topKmer < seqKmerCount; topKmer++) { - size_t kmerCurr = (kmers + topKmer)->kmer; - size_t kmerNext = (kmers + topKmer + 1)->kmer; - if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) { - kmerCurr = BIT_SET(kmerCurr, 63); - kmerNext = BIT_SET(kmerNext, 63); - } - repeatKmerCnt += (kmerCurr == kmerNext || kmerCurr == prevKmer); - prevKmer = kmerCurr; - } - if(repeatKmerCnt >= par.skipNRepeatKmer){ - kmerConsidered = 0; - } - } // add k-mer to represent the identity //TODO, how to handle this in reverse? 
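The hunk just above drops the old skipNRepeatKmer handling, which effectively discarded every k-mer of a sequence once it contained enough exact repeats, and the hunk that follows adds the replacement: when --ignore-multi-kmer is set, a k-mer that occurs more than once in the sequence (detected by comparing each k-mer with the next entry and the previously seen one, as in the added loop) is skipped on its own, while the sequence's remaining k-mers are still used for matching. A minimal, self-contained sketch of that selection idea is shown below; selectKmers, its plain std::vector interface, and the sort by k-mer value are illustrative simplifications and not the actual MMseqs2 code.

// Sketch only: per-k-mer skipping of repeated k-mers, assuming a plain
// vector of k-mer codes instead of MMseqs2's internal k-mer buffers.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Keep at most maxKmers k-mers of one sequence. With ignoreMultiKmer set,
// any k-mer value that occurs more than once is skipped individually,
// instead of the whole sequence being dropped.
std::vector<uint64_t> selectKmers(std::vector<uint64_t> kmers, size_t maxKmers, bool ignoreMultiKmer) {
    std::sort(kmers.begin(), kmers.end());
    std::vector<uint64_t> picked;
    for (size_t i = 0; i < kmers.size() && picked.size() < maxKmers; ++i) {
        bool sameAsPrev = (i > 0 && kmers[i] == kmers[i - 1]);
        bool sameAsNext = (i + 1 < kmers.size() && kmers[i] == kmers[i + 1]);
        if (ignoreMultiKmer && (sameAsPrev || sameAsNext)) {
            continue; // repeated k-mer: skip only this k-mer, keep the sequence
        }
        picked.push_back(kmers[i]);
    }
    return picked;
}

int main() {
    std::vector<uint64_t> kmers = {12, 7, 12, 3, 9, 7, 5};
    for (uint64_t k : selectKmers(kmers, 3, true)) {
        std::printf("%llu\n", static_cast<unsigned long long>(k)); // prints 3, 5, 9
    }
    return 0;
}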
@@ -337,16 +318,37 @@ std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBRe bufferPos++; if (bufferPos >= BUFFER_SIZE) { size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos); - memcpy(hashSeqPair + writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); + memcpy(hashSeqPair + writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); bufferPos = 0; } } - for (size_t topKmer = 0; topKmer < kmerConsidered; topKmer++) { - size_t kmer = (kmers + topKmer)->kmer; - if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) { - kmer = BIT_SET(kmer, 63); + + size_t kmersToConsider = std::min(static_cast(chooseTopKmer - 1 + (chooseTopKmerScale * seq.L)), seqKmerCount); + size_t kmersConsidered = 0; + + size_t prevKmer = SIZE_T_MAX; + kmers[seqKmerCount].kmer = SIZE_T_MAX; + for (size_t topKmer = 0; topKmer < static_cast(seqKmerCount) && + kmersConsidered < kmersToConsider; topKmer++) { + + size_t kmerCurr = (kmers + topKmer)->kmer; + size_t kmerNext = (kmers + topKmer + 1)->kmer; + + if (TYPE == Parameters::DBTYPE_NUCLEOTIDES) { + kmerCurr = BIT_SET(kmerCurr, 63); + kmerNext = BIT_SET(kmerNext, 63); + } + + bool repeatedKmer = (kmerCurr == kmerNext || kmerCurr == prevKmer); + prevKmer = kmerCurr; + + /* skip repeated kmer */ + if (par.ignoreMultiKmer > 0 && repeatedKmer) { + continue; } - size_t splitIdx = kmer % splits; + + kmersConsidered++; + size_t splitIdx = kmerCurr % splits; if (splitIdx != split) { continue; } @@ -358,7 +360,7 @@ std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBRe bufferPos++; if (bufferPos >= BUFFER_SIZE) { size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos); - memcpy(hashSeqPair + writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); + memcpy(hashSeqPair + writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); bufferPos = 0; } } @@ -376,7 +378,7 @@ std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBRe if(bufferPos > 0){ size_t writeOffset = __sync_fetch_and_add(&offset, bufferPos); - memcpy(hashSeqPair+writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); + memcpy(hashSeqPair+writeOffset, threadKmerBuffer, sizeof(KmerPosition) * bufferPos); } delete[] kmers; delete[] charSequence; @@ -397,8 +399,8 @@ std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBRe return std::make_pair(offset, longestKmer); } - -KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std::string splitFile, +template +KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std::string splitFile, DBReader & seqDbr, Parameters & par, BaseMatrix * subMat, size_t KMER_SIZE, size_t chooseTopKmer, bool adjustLength, float chooseTopKmerScale) { @@ -413,18 +415,18 @@ KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std memoryLimit = static_cast(Util::getTotalSystemMemory() * 0.9); } // we do not really know how much memory is needed. 
So this is our best choice - splitKmerCount = (memoryLimit / sizeof(KmerPosition)); + splitKmerCount = (memoryLimit / sizeof(KmerPosition)); } - KmerPosition * hashSeqPair = initKmerPositionMemory(splitKmerCount); + KmerPosition * hashSeqPair = initKmerPositionMemory(splitKmerCount); size_t elementsToSort; if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - std::pair ret = fillKmerPositionArray(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, true, splits, split, 1, adjustLength, chooseTopKmerScale); + std::pair ret = fillKmerPositionArray(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, true, splits, split, 1, adjustLength, chooseTopKmerScale); elementsToSort = ret.first; KMER_SIZE = ret.second; Debug(Debug::INFO) << "\nAdjusted k-mer length " << KMER_SIZE << "\n"; }else{ - std::pair ret = fillKmerPositionArray(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, true, splits, split, 1, false, chooseTopKmerScale); + std::pair ret = fillKmerPositionArray(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, true, splits, split, 1, false, chooseTopKmerScale); elementsToSort = ret.first; } if(splits == 1){ @@ -434,9 +436,9 @@ KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std Debug(Debug::INFO) << "Sort kmer "; Timer timer; if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { - omptl::sort(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPosReverse); + omptl::sort(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPosReverse); }else{ - omptl::sort(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPos); + omptl::sort(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPos); } Debug(Debug::INFO) << timer.lap() << "\n"; @@ -453,18 +455,18 @@ KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std // The longest sequence is the first since we sorted by kmer, seq.Len and id size_t writePos; if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - writePos = assignGroup(hashSeqPair, splitKmerCount, par.includeOnlyExtendable, par.covMode, par.covThr); + writePos = assignGroup(hashSeqPair, splitKmerCount, par.includeOnlyExtendable, par.covMode, par.covThr); }else{ - writePos = assignGroup(hashSeqPair, splitKmerCount, par.includeOnlyExtendable, par.covMode, par.covThr); + writePos = assignGroup(hashSeqPair, splitKmerCount, par.includeOnlyExtendable, par.covMode, par.covThr); } // sort by rep. sequence (stored in kmer) and sequence id Debug(Debug::INFO) << "Sort by rep. 
sequence "; timer.reset(); if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - omptl::sort(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiagReverse); + omptl::sort(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiagReverse); }else{ - omptl::sort(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiag); + omptl::sort(hashSeqPair, hashSeqPair + writePos, KmerPosition::compareRepSequenceAndIdAndDiag); } //kx::radix_sort(hashSeqPair, hashSeqPair + elementsToSort, SequenceComparision()); // for(size_t i = 0; i < writePos; i++){ @@ -474,9 +476,9 @@ KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std if(splits > 1){ if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - writeKmersToDisk(splitFile, hashSeqPair, writePos + 1); + writeKmersToDisk(splitFile, hashSeqPair, writePos + 1); }else{ - writeKmersToDisk(splitFile, hashSeqPair, writePos + 1); + writeKmersToDisk(splitFile, hashSeqPair, writePos + 1); } delete [] hashSeqPair; hashSeqPair = NULL; @@ -484,8 +486,8 @@ KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std return hashSeqPair; } -template -size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr) { +template +size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr) { size_t writePos=0; size_t prevHash = hashSeqPair[0].kmer; size_t repSeqId = hashSeqPair[0].id; @@ -496,9 +498,9 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includ } size_t prevHashStart = 0; size_t prevSetSize = 0; - unsigned short queryLen=hashSeqPair[0].seqLen; + T queryLen=hashSeqPair[0].seqLen; bool repIsReverse = false; - unsigned int repSeq_i_pos = hashSeqPair[0].pos; + T repSeq_i_pos = hashSeqPair[0].pos; for (size_t elementIdx = 0; elementIdx < splitKmerCount+1; elementIdx++) { size_t currKmer = hashSeqPair[elementIdx].kmer; if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ @@ -513,7 +515,7 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includ size_t rId = (kmer != SIZE_T_MAX) ? ((prevSetSize == 1) ? SIZE_T_MAX : repSeqId) : SIZE_T_MAX; // remove singletones from set if(rId != SIZE_T_MAX){ - short diagonal = repSeq_i_pos - hashSeqPair[i].pos; + int diagonal = repSeq_i_pos - hashSeqPair[i].pos; if(TYPE == Parameters::DBTYPE_NUCLEOTIDES){ // 00 No problem here both are forward // 01 We can revert the query of target, lets invert the query. 
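Throughout these kmermatcher hunks, KmerPosition and its helper functions gain a sequence-length template parameter: positions, query lengths, and diagonals that used to be (unsigned) short are now the template type T, instantiated as short or int in the explicit instantiations that follow and chosen at runtime from the longest sequence in the database. The motivation is that a 16-bit position cannot hold coordinates of sequences longer than SHRT_MAX, so k-mer match positions and the diagonals derived from them silently wrap. The sketch below only illustrates that overflow and the templated widening; KmerHit and diagonal() are simplified stand-ins, not MMseqs2 types.

// Sketch only: why the position type is templated instead of fixed to short.
#include <cstdio>

template <typename SeqLenType>
struct KmerHit {            // hypothetical stand-in for a templated k-mer position
    unsigned int seqId;
    SeqLenType pos;         // k-mer position within the sequence
};

// Diagonal of a query/target k-mer match, computed in int as in the patched code.
template <typename SeqLenType>
int diagonal(const KmerHit<SeqLenType> &query, const KmerHit<SeqLenType> &target) {
    return static_cast<int>(query.pos) - static_cast<int>(target.pos);
}

int main() {
    // A position of 40000 does not fit into a signed 16-bit value and wraps
    // to a negative number on typical platforms, so the diagonal is wrong:
    KmerHit<short> narrowQuery{0, static_cast<short>(40000)};
    KmerHit<short> narrowTarget{1, 100};
    std::printf("short-based diagonal: %d\n", diagonal(narrowQuery, narrowTarget));

    // Widening the position type, as done for databases whose longest sequence
    // exceeds SHRT_MAX, yields the expected diagonal of 39900:
    KmerHit<int> wideQuery{0, 40000};
    KmerHit<int> wideTarget{1, 100};
    std::printf("int-based diagonal:   %d\n", diagonal(wideQuery, wideTarget));
    return 0;
}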
@@ -524,8 +526,8 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includ bool queryNeedsToBeRev = false; // we now need 2 byte of information (00),(01),(10),(11) // we need to flip the coordinates of the query - short queryPos=0; - short targetPos=0; + T queryPos=0; + T targetPos=0; // revert kmer in query hits normal kmer in target // we need revert the query if (repIsReverse == true && targetIsReverse == false){ @@ -595,9 +597,10 @@ size_t assignGroup(KmerPosition *hashSeqPair, size_t splitKmerCount, bool includ return writePos; } -template size_t assignGroup<0>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr); -template size_t assignGroup<1>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr); - +template size_t assignGroup<0, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr); +template size_t assignGroup<0, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr); +template size_t assignGroup<1, short>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr); +template size_t assignGroup<1, int>(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr); void setLinearFilterDefault(Parameters *p) { p->spacedKmer = false; @@ -619,27 +622,15 @@ size_t computeKmerCount(DBReader &reader, size_t KMER_SIZE, size_t } return totalKmers; } - +template size_t computeMemoryNeededLinearfilter(size_t totalKmer) { - return sizeof(KmerPosition) * totalKmer; + return sizeof(KmerPosition) * totalKmer; } -int kmermatcher(int argc, const char **argv, const Command &command) { - MMseqsMPI::init(argc, argv); - - Parameters &par = Parameters::getInstance(); - setLinearFilterDefault(&par); - par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR); - - DBReader seqDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); - seqDbr.open(DBReader::NOSORT); - int querySeqType = seqDbr.getDbtype(); - - setKmerLengthAndAlphabet(par, seqDbr.getAminoAcidDBSize(), querySeqType); - std::vector* params = command.params; - par.printParameters(command.cmd, argc, argv, *params); - Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; +template +int kmermatcherInner(Parameters& par, DBReader& seqDbr) { + int querySeqType = seqDbr.getDbtype(); BaseMatrix *subMat; if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES)) { subMat = new NucleotideMatrix(par.scoringMatrixFile.nucleotides, 1.0, 0.0); @@ -666,7 +657,7 @@ int kmermatcher(int argc, const char **argv, const Command &command) { } Debug(Debug::INFO) << "\n"; size_t totalKmers = computeKmerCount(seqDbr, KMER_SIZE, chooseTopKmer, chooseTopKmerScale); - size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); + size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); Debug(Debug::INFO) << "Estimated memory consumption " << totalSizeNeeded/1024/1024 << " MB\n"; // compute splits size_t splits = static_cast(std::ceil(static_cast(totalSizeNeeded) / memoryLimit)); @@ -679,7 +670,7 @@ int kmermatcher(int argc, const char **argv, const Command &command) { Debug(Debug::INFO) << "Process file into " << splits << " parts\n"; } std::vector splitFiles; - KmerPosition *hashSeqPair = NULL; + KmerPosition 
*hashSeqPair = NULL; size_t mpiRank = 0; #ifdef HAVE_MPI @@ -702,7 +693,7 @@ int kmermatcher(int argc, const char **argv, const Command &command) { for(size_t split = fromSplit; split < fromSplit+splitCount; split++) { std::string splitFileName = par.db2 + "_split_" +SSTR(split); - hashSeqPair = doComputation(totalKmers, split, splits, splitFileName, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, par.adjustKmerLength, chooseTopKmerScale); + hashSeqPair = doComputation(totalKmers, split, splits, splitFileName, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, par.adjustKmerLength, chooseTopKmerScale); } MPI_Barrier(MPI_COMM_WORLD); if(mpiRank == 0){ @@ -717,7 +708,7 @@ int kmermatcher(int argc, const char **argv, const Command &command) { std::string splitFileNameDone = splitFileName + ".done"; if(FileUtil::fileExists(splitFileNameDone.c_str()) == false){ - hashSeqPair = doComputation(totalKmers, split, splits, splitFileName, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, par.adjustKmerLength, chooseTopKmerScale); + hashSeqPair = doComputation(totalKmers, split, splits, splitFileName, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, par.adjustKmerLength, chooseTopKmerScale); } splitFiles.push_back(splitFileName); @@ -782,14 +773,42 @@ int kmermatcher(int argc, const char **argv, const Command &command) { if(hashSeqPair){ delete [] hashSeqPair; } + + return EXIT_SUCCESS; +} + +int kmermatcher(int argc, const char **argv, const Command &command) { + MMseqsMPI::init(argc, argv); + + Parameters &par = Parameters::getInstance(); + setLinearFilterDefault(&par); + par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_CLUSTLINEAR); + + DBReader seqDbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, + DBReader::USE_INDEX | DBReader::USE_DATA); + seqDbr.open(DBReader::NOSORT); + int querySeqType = seqDbr.getDbtype(); + + setKmerLengthAndAlphabet(par, seqDbr.getAminoAcidDBSize(), querySeqType); + std::vector *params = command.params; + par.printParameters(command.cmd, argc, argv, *params); + Debug(Debug::INFO) << "Database size: " << seqDbr.getSize() << " type: " << seqDbr.getDbTypeName() << "\n"; + + if (seqDbr.getMaxSeqLen() < SHRT_MAX) { + kmermatcherInner(par, seqDbr); + } + else { + kmermatcherInner(par, seqDbr); + } + seqDbr.close(); return EXIT_SUCCESS; } -template +template void writeKmerMatcherResult(DBWriter & dbw, - KmerPosition *hashSeqPair, size_t totalKmers, + KmerPosition *hashSeqPair, size_t totalKmers, std::vector &repSequence, size_t threads) { std::vector threadOffsets; size_t splitSize = totalKmers/threads; @@ -858,9 +877,13 @@ void writeKmerMatcherResult(DBWriter & dbw, size_t topScore =0; int bestReverMask = reverMask; // compute best diagonal and score for every group of target sequences + while(lastTargetId != targetId - && kmerPos+kmerOffset < threadOffsets[thread+1] - && hashSeqPair[kmerPos+kmerOffset].id == targetId){ + && kmerPos+kmerOffset < threadOffsets[thread+1] + && hashSeqPair[kmerPos+kmerOffset].id == targetId + && ((TYPE ==Parameters::DBTYPE_NUCLEOTIDES)? 
BIT_CLEAR(hashSeqPair[kmerPos+kmerOffset].kmer, 63): + hashSeqPair[kmerPos+kmerOffset].kmer) == repSeqId){ + if(prevDiagonal == hashSeqPair[kmerPos+kmerOffset].pos){ diagonalCnt++; }else{ @@ -1075,11 +1098,11 @@ void mergeKmerFilesAndOutput(DBWriter & dbw, } -template -void writeKmersToDisk(std::string tmpFile, KmerPosition *hashSeqPair, size_t totalKmers) { +template +void writeKmersToDisk(std::string tmpFile, KmerPosition *hashSeqPair, size_t totalKmers) { size_t repSeqId = SIZE_T_MAX; size_t lastTargetId = SIZE_T_MAX; - short lastDiagonal=0; + seqLenType lastDiagonal=0; int diagonalScore=0; FILE* filePtr = fopen(tmpFile.c_str(), "wb"); if(filePtr == NULL) { perror(tmpFile.c_str()); EXIT(EXIT_FAILURE); } @@ -1118,7 +1141,7 @@ void writeKmersToDisk(std::string tmpFile, KmerPosition *hashSeqPair, size_t tot } unsigned int targetId = hashSeqPair[kmerPos].id; - unsigned short diagonal = hashSeqPair[kmerPos].pos; + seqLenType diagonal = hashSeqPair[kmerPos].pos; int forward = 0; int reverse = 0; // find diagonal score @@ -1132,7 +1155,7 @@ void writeKmersToDisk(std::string tmpFile, KmerPosition *hashSeqPair, size_t tot reverse += isReverse == true; } kmerPos++; - }while(targetId == hashSeqPair[kmerPos].id && hashSeqPair[kmerPos].id == diagonal && kmerPos < totalKmers && hashSeqPair[kmerPos].kmer != SIZE_T_MAX); + }while(targetId == hashSeqPair[kmerPos].id && hashSeqPair[kmerPos].pos == diagonal && kmerPos < totalKmers && hashSeqPair[kmerPos].kmer != SIZE_T_MAX); kmerPos--; elemenetCnt++; @@ -1191,22 +1214,59 @@ void setKmerLengthAndAlphabet(Parameters ¶meters, size_t aaDbSize, int seqTy } } -template std::pair fillKmerPositionArray<0>(KmerPosition * hashSeqPair, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, - size_t KMER_SIZE, size_t chooseTopKmer, - bool includeIdenticalKmer, size_t splits, size_t split, size_t pickNBest, - bool adjustKmerLength, - float chooseTopKmerScale); -template std::pair fillKmerPositionArray<1>(KmerPosition * hashSeqPair, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, - size_t KMER_SIZE, size_t chooseTopKmer, - bool includeIdenticalKmer, size_t splits, size_t split, size_t pickNBest, - bool adjustKmerLength, - float chooseTopKmerScale); -template std::pair fillKmerPositionArray<2>(KmerPosition * hashSeqPair, DBReader &seqDbr, - Parameters & par, BaseMatrix * subMat, - size_t KMER_SIZE, size_t chooseTopKmer, - bool includeIdenticalKmer, size_t splits, size_t split, size_t pickNBest, - bool adjustKmerLength, - float chooseTopKmerScale); +template std::pair fillKmerPositionArray<0, short>(KmerPosition * hashSeqPair, + DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, + size_t KMER_SIZE, size_t chooseTopKmer, + bool includeIdenticalKmer, size_t splits, size_t split, + size_t pickNBest, + bool adjustKmerLength, + float chooseTopKmerScale); +template std::pair fillKmerPositionArray<1, short>(KmerPosition * hashSeqPair, + DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, + size_t KMER_SIZE, size_t chooseTopKmer, + bool includeIdenticalKmer, size_t splits, size_t split, + size_t pickNBest, + bool adjustKmerLength, + float chooseTopKmerScale); +template std::pair fillKmerPositionArray<2, short>(KmerPosition * hashSeqPair, + DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, + size_t KMER_SIZE, size_t chooseTopKmer, + bool includeIdenticalKmer, size_t splits, size_t split, + size_t pickNBest, + bool adjustKmerLength, + float chooseTopKmerScale); +template std::pair fillKmerPositionArray<0, int>(KmerPosition * hashSeqPair, + 
DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, + size_t KMER_SIZE, size_t chooseTopKmer, + bool includeIdenticalKmer, size_t splits, size_t split, + size_t pickNBest, + bool adjustKmerLength, + float chooseTopKmerScale); +template std::pair fillKmerPositionArray<1, int>(KmerPosition * hashSeqPair, + DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, + size_t KMER_SIZE, size_t chooseTopKmer, + bool includeIdenticalKmer, size_t splits, size_t split, + size_t pickNBest, + bool adjustKmerLength, + float chooseTopKmerScale); +template std::pair fillKmerPositionArray<2, int>(KmerPosition< int> * hashSeqPair, + DBReader &seqDbr, + Parameters & par, BaseMatrix * subMat, + size_t KMER_SIZE, size_t chooseTopKmer, + bool includeIdenticalKmer, size_t splits, size_t split, + size_t pickNBest, + bool adjustKmerLength, + float chooseTopKmerScale); + +template KmerPosition *initKmerPositionMemory(size_t size); +template KmerPosition *initKmerPositionMemory(size_t size); + +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); + #undef SIZE_T_MAX diff --git a/src/linclust/kmermatcher.h b/src/linclust/kmermatcher.h index ce25c24..741cf4f 100644 --- a/src/linclust/kmermatcher.h +++ b/src/linclust/kmermatcher.h @@ -7,13 +7,14 @@ #include "Parameters.h" #include "BaseMatrix.h" -struct KmerPosition { +template +struct __attribute__((__packed__))KmerPosition { size_t kmer; unsigned int id; - unsigned short seqLen; - short pos; + T seqLen; + T pos; - static bool compareRepSequenceAndIdAndPos(const KmerPosition &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndPos(const KmerPosition &first, const KmerPosition &second){ if(first.kmer < second.kmer ) return true; if(second.kmer < first.kmer ) @@ -33,7 +34,7 @@ struct KmerPosition { return false; } - static bool compareRepSequenceAndIdAndPosReverse(const KmerPosition &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndPosReverse(const KmerPosition &first, const KmerPosition &second){ size_t firstKmer = BIT_SET(first.kmer, 63); size_t secondKmer = BIT_SET(second.kmer, 63); if(firstKmer < secondKmer ) @@ -55,7 +56,7 @@ struct KmerPosition { return false; } - static bool compareRepSequenceAndIdAndDiagReverse(const KmerPosition &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndDiagReverse(const KmerPosition &first, const KmerPosition &second){ size_t firstKmer = BIT_SET(first.kmer, 63); size_t secondKmer = BIT_SET(second.kmer, 63); if(firstKmer < secondKmer) @@ -73,7 +74,7 @@ struct KmerPosition { return false; } - static bool compareRepSequenceAndIdAndDiag(const KmerPosition &first, const KmerPosition &second){ + static bool compareRepSequenceAndIdAndDiag(const KmerPosition &first, const KmerPosition &second){ if(first.kmer < second.kmer) return true; if(second.kmer < first.kmer) @@ -152,8 +153,8 @@ class CompareResultBySeqId { }; -template -size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr); +template +size_t assignGroup(KmerPosition *kmers, size_t splitKmerCount, bool includeOnlyExtendable, int covMode, float covThr); template void mergeKmerFilesAndOutput(DBWriter & dbw, std::vector tmpFiles, std::vector &repSequence); @@ -165,27 +166,28 @@ size_t queueNextEntry(KmerPositionQueue &queue, int file, size_t offsetPos, T *e void setKmerLengthAndAlphabet(Parameters ¶meters, size_t aaDbSize, int seqType); -template -void 
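// Minimal sketch (stand-in type, not from this patch) of what the packed, templated
// struct above changes in memory terms on a typical 64-bit layout: with T = short the
// record is 16 bytes, with T = int it is 20 bytes packed (24 unpacked, because of the
// size_t alignment). Since the k-mer array is sized as sizeof(KmerPosition<T>) * totalKmer,
// kmermatcher only switches to the int variant when the database actually contains
// sequences of length >= SHRT_MAX.
#include <cstdio>

template <typename T>
struct __attribute__((__packed__)) PackedKmerRecord {   // illustrative stand-in for KmerPosition<T>
    size_t kmer;         // 8 bytes
    unsigned int id;     // 4 bytes
    T seqLen;            // 2 bytes for short, 4 for int
    T pos;               // 2 bytes for short, 4 for int
};

int main() {
    std::printf("record with short positions: %zu bytes\n", sizeof(PackedKmerRecord<short>)); // expected 16
    std::printf("record with int positions:   %zu bytes\n", sizeof(PackedKmerRecord<int>));   // expected 20
    return 0;
}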
writeKmersToDisk(std::string tmpFile, KmerPosition *kmers, size_t totalKmers); +template +void writeKmersToDisk(std::string tmpFile, KmerPosition *kmers, size_t totalKmers); -template -void writeKmerMatcherResult(DBWriter & dbw, KmerPosition *hashSeqPair, size_t totalKmers, +template +void writeKmerMatcherResult(DBWriter & dbw, KmerPosition *hashSeqPair, size_t totalKmers, std::vector &repSequence, size_t threads); -KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std::string splitFile, +template +KmerPosition * doComputation(size_t totalKmers, size_t split, size_t splits, std::string splitFile, DBReader & seqDbr, Parameters & par, BaseMatrix * subMat, size_t KMER_SIZE, size_t chooseTopKmer, float chooseTopKmerScale = 0.0); +template +KmerPosition *initKmerPositionMemory(size_t size); -KmerPosition *initKmerPositionMemory(size_t size); - -template -std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBReader &seqDbr, +template +std::pair fillKmerPositionArray(KmerPosition * hashSeqPair, DBReader &seqDbr, Parameters & par, BaseMatrix * subMat, const size_t KMER_SIZE, size_t chooseTopKmer, bool includeIdenticalKmer, size_t splits, size_t split, size_t pickNBest, bool adjustLength, float chooseTopKmerScale = 0.0); - +template size_t computeMemoryNeededLinearfilter(size_t totalKmer); size_t computeKmerCount(DBReader &reader, size_t KMER_SIZE, size_t chooseTopKmer, diff --git a/src/linclust/kmersearch.cpp b/src/linclust/kmersearch.cpp index ad7dcaa..03583e8 100644 --- a/src/linclust/kmersearch.cpp +++ b/src/linclust/kmersearch.cpp @@ -34,24 +34,24 @@ KmerSearch::ExtractKmerAndSortResult KmerSearch::extractKmerAndSort(size_t total memoryLimit = static_cast(Util::getTotalSystemMemory() * 0.9); } // we do not really know how much memory is needed. So this is our best choice - splitKmerCount = (memoryLimit / sizeof(KmerPosition)); + splitKmerCount = (memoryLimit / sizeof(KmerPosition)); } - KmerPosition * hashSeqPair = initKmerPositionMemory(splitKmerCount*pickNBest); + KmerPosition * hashSeqPair = initKmerPositionMemory(splitKmerCount*pickNBest); Timer timer; size_t elementsToSort; if(pickNBest > 1){ - std::pair ret = fillKmerPositionArray(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, + std::pair ret = fillKmerPositionArray(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, false, splits, split, pickNBest, false); elementsToSort = ret.first; } else if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)){ - std::pair ret = fillKmerPositionArray(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, + std::pair ret = fillKmerPositionArray(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, false, splits, split, 1, adjustLength); elementsToSort = ret.first; KMER_SIZE = ret.second; Debug(Debug::INFO) << "\nAdjusted k-mer length " << KMER_SIZE << "\n"; }else { - std::pair ret = fillKmerPositionArray(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, + std::pair ret = fillKmerPositionArray(hashSeqPair, seqDbr, par, subMat, KMER_SIZE, chooseTopKmer, false, splits, split, 1, false); elementsToSort = ret.first; @@ -64,9 +64,9 @@ KmerSearch::ExtractKmerAndSortResult KmerSearch::extractKmerAndSort(size_t total Debug(Debug::INFO) << "Sort kmer ... 
"; timer.reset(); if(Parameters::isEqualDbtype(seqDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { - omptl::sort(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPosReverse); + omptl::sort(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPosReverse); }else{ - omptl::sort(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPos); + omptl::sort(hashSeqPair, hashSeqPair + elementsToSort, KmerPosition::compareRepSequenceAndIdAndPos); } @@ -76,7 +76,7 @@ KmerSearch::ExtractKmerAndSortResult KmerSearch::extractKmerAndSort(size_t total } template -void KmerSearch::writeResult(DBWriter & dbw, KmerPosition *kmers, size_t kmerCount) { +void KmerSearch::writeResult(DBWriter & dbw, KmerPosition *kmers, size_t kmerCount) { size_t repSeqId = SIZE_T_MAX; unsigned int prevHitId; char buffer[100]; @@ -146,8 +146,8 @@ void KmerSearch::writeResult(DBWriter & dbw, KmerPosition *kmers, size_t kmerCou } } -template void KmerSearch::writeResult<0>(DBWriter & dbw, KmerPosition *kmers, size_t kmerCount); -template void KmerSearch::writeResult<1>(DBWriter & dbw, KmerPosition *kmers, size_t kmerCount); +template void KmerSearch::writeResult<0>(DBWriter & dbw, KmerPosition *kmers, size_t kmerCount); +template void KmerSearch::writeResult<1>(DBWriter & dbw, KmerPosition *kmers, size_t kmerCount); int kmersearch(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); @@ -219,7 +219,7 @@ int kmersearch(int argc, const char **argv, const Command &command) { } size_t totalKmers = computeKmerCount(queryDbr, KMER_SIZE, chooseTopKmer); - size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); + size_t totalSizeNeeded = computeMemoryNeededLinearfilter(totalKmers); Debug(Debug::INFO) << "Estimated memory consumption " << totalSizeNeeded/1024/1024 << " MB\n"; BaseMatrix *subMat; @@ -269,7 +269,7 @@ int kmersearch(int argc, const char **argv, const Command &command) { KMER_SIZE, chooseTopKmer, par.pickNbest, isAdjustedKmerLen); - std::pair result; + std::pair *, size_t> result; if (Parameters::isEqualDbtype(queryDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { result = KmerSearch::searchInIndex(sortedKmers.kmers, sortedKmers.kmerCount, kmerIndex); @@ -278,7 +278,7 @@ int kmersearch(int argc, const char **argv, const Command &command) { sortedKmers.kmerCount, kmerIndex); } - KmerPosition *kmers = result.first; + KmerPosition *kmers = result.first; size_t kmerCount = result.second; if (splits == 1) { DBWriter dbw(tmpFiles.first.c_str(), tmpFiles.second.c_str(), 1, par.compressed, outDbType); @@ -291,10 +291,10 @@ int kmersearch(int argc, const char **argv, const Command &command) { dbw.close(); } else { if (Parameters::isEqualDbtype(queryDbr.getDbtype(), Parameters::DBTYPE_NUCLEOTIDES)) { - writeKmersToDisk(tmpFiles.first, kmers, - kmerCount ); + writeKmersToDisk(tmpFiles.first, kmers, + kmerCount ); } else { - writeKmersToDisk(tmpFiles.first, kmers, kmerCount ); + writeKmersToDisk(tmpFiles.first, kmers, kmerCount ); } } delete[] kmers; @@ -322,7 +322,7 @@ int kmersearch(int argc, const char **argv, const Command &command) { return EXIT_SUCCESS; } template -std::pair KmerSearch::searchInIndex( KmerPosition *kmers, size_t kmersSize, KmerIndex &kmerIndex) { +std::pair *,size_t > KmerSearch::searchInIndex( KmerPosition *kmers, size_t kmersSize, KmerIndex &kmerIndex) { Timer timer; kmerIndex.reset(); @@ -345,7 +345,7 @@ std::pair KmerSearch::searchInIndex( KmerPosition *kmers 
size_t targetKmer; while(isDone == false){ - KmerPosition * currQueryKmer = &kmers[kmerPos]; + KmerPosition * currQueryKmer = &kmers[kmerPos]; if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) { queryKmer = BIT_SET(currQueryKmer->kmer, 63); targetKmer = BIT_SET(currTargetKmer.kmer, 63); @@ -362,7 +362,7 @@ std::pair KmerSearch::searchInIndex( KmerPosition *kmers isDone = true; break; } - KmerPosition * currQueryKmer = &kmers[kmerPos]; + KmerPosition * currQueryKmer = &kmers[kmerPos]; if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) { queryKmer = BIT_SET(currQueryKmer->kmer, 63); }else{ @@ -448,16 +448,16 @@ std::pair KmerSearch::searchInIndex( KmerPosition *kmers Debug(Debug::INFO) << "Time to find k-mers: " << timer.lap() << "\n"; timer.reset(); if(TYPE == Parameters::DBTYPE_NUCLEOTIDES) { - omptl::sort(kmers, kmers + writePos, KmerPosition::compareRepSequenceAndIdAndDiagReverse); + omptl::sort(kmers, kmers + writePos, KmerPosition::compareRepSequenceAndIdAndDiagReverse); }else{ - omptl::sort(kmers, kmers + writePos, KmerPosition::compareRepSequenceAndIdAndDiag); + omptl::sort(kmers, kmers + writePos, KmerPosition::compareRepSequenceAndIdAndDiag); } Debug(Debug::INFO) << "Time to sort: " << timer.lap() << "\n"; return std::make_pair(kmers, writePos); } -template std::pair KmerSearch::searchInIndex<0>( KmerPosition *kmers, size_t kmersSize, KmerIndex &kmerIndex); -template std::pair KmerSearch::searchInIndex<1>( KmerPosition *kmers, size_t kmersSize, KmerIndex &kmerIndex); +template std::pair *,size_t > KmerSearch::searchInIndex<0>( KmerPosition *kmers, size_t kmersSize, KmerIndex &kmerIndex); +template std::pair *,size_t > KmerSearch::searchInIndex<1>( KmerPosition *kmers, size_t kmersSize, KmerIndex &kmerIndex); #undef SIZE_T_MAX diff --git a/src/linclust/kmersearch.h b/src/linclust/kmersearch.h index d1fe3fe..cbdef04 100644 --- a/src/linclust/kmersearch.h +++ b/src/linclust/kmersearch.h @@ -12,17 +12,17 @@ class KmerSearch{ public: template - static std::pair searchInIndex( KmerPosition *kmers, size_t kmersSize, KmerIndex &kmerIndex); + static std::pair *,size_t > searchInIndex( KmerPosition *kmers, size_t kmersSize, KmerIndex &kmerIndex); template - static void writeResult(DBWriter & dbw, KmerPosition *kmers, size_t kmerCount); + static void writeResult(DBWriter & dbw, KmerPosition *kmers, size_t kmerCount); struct ExtractKmerAndSortResult{ - ExtractKmerAndSortResult(size_t kmerCount, KmerPosition * kmers, size_t adjustedKmer) + ExtractKmerAndSortResult(size_t kmerCount, KmerPosition * kmers, size_t adjustedKmer) : kmerCount(kmerCount), kmers(kmers), adjustedKmer(adjustedKmer) {} size_t kmerCount; - KmerPosition * kmers; + KmerPosition * kmers; size_t adjustedKmer; }; static ExtractKmerAndSortResult extractKmerAndSort(size_t splitKmerCount, size_t split, size_t splits, diff --git a/src/multihit/MultiHitDb.cpp b/src/multihit/MultiHitDb.cpp index bbeb4a5..c5f9054 100644 --- a/src/multihit/MultiHitDb.cpp +++ b/src/multihit/MultiHitDb.cpp @@ -52,7 +52,6 @@ int multihitdb(int argc, const char **argv, const Command &command) { cmd.addVariable("REMOVE_TMP", "TRUE"); } - par.splitSeqByLen = false; cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.createdb).c_str()); cmd.addVariable("EXTRACTORFS_PAR", par.createParameterString(par.extractorfs).c_str()); cmd.addVariable("TRANSLATENUCS_PAR", par.createParameterString(par.translatenucs).c_str()); diff --git a/src/multihit/MultiHitSearch.cpp b/src/multihit/MultiHitSearch.cpp index daf6a07..54c6043 100644 --- a/src/multihit/MultiHitSearch.cpp +++ 
b/src/multihit/MultiHitSearch.cpp @@ -77,7 +77,6 @@ int multihitsearch(int argc, const char **argv, const Command &command) { if (par.removeTmpFiles) { cmd.addVariable("REMOVE_TMP", "TRUE"); } - par.splitSeqByLen = false; cmd.addVariable("SEARCH_PAR", par.createParameterString(par.searchworkflow).c_str()); cmd.addVariable("BESTHITBYSET_PAR", par.createParameterString(par.besthitbyset).c_str()); cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); diff --git a/src/multihit/besthitperset.cpp b/src/multihit/besthitperset.cpp index 2ec8598..41ab7cc 100644 --- a/src/multihit/besthitperset.cpp +++ b/src/multihit/besthitperset.cpp @@ -27,9 +27,6 @@ public : void prepareInput(unsigned int, unsigned int) {} std::string aggregateEntry(std::vector> &dataToAggregate, unsigned int, unsigned int targetSetKey, unsigned int thread_idx) { - std::string buffer; - buffer.reserve(1024); - double bestScore = -DBL_MAX; double secondBestScore = -DBL_MAX; double bestEval = DBL_MAX; @@ -94,9 +91,12 @@ public : } if (bestEntry == NULL) { - return buffer; + return ""; } - + + std::string buffer; + buffer.reserve(1024); + // Aggregate the full line into string for (size_t i = 0; i < bestEntry->size(); ++i) { if (i == 1) { @@ -106,7 +106,9 @@ public : } else { buffer.append(bestEntry->at(i)); } - buffer.append("\t"); + if (i != (bestEntry->size() - 1)) { + buffer.append(1, '\t'); + } } return buffer; diff --git a/src/prefiltering/IndexTable.h b/src/prefiltering/IndexTable.h index 59d774d..f6970f1 100644 --- a/src/prefiltering/IndexTable.h +++ b/src/prefiltering/IndexTable.h @@ -94,7 +94,6 @@ class IndexTable { // count k-mers in the sequence, so enough memory for the sequence lists can be allocated in the end size_t addSimilarKmerCount(Sequence* s, KmerGenerator* kmerGenerator){ - s->resetCurrPos(); std::vector seqKmerPosBuffer; @@ -247,11 +246,9 @@ class IndexTable { this->entries = new(std::nothrow) IndexEntryLocal[tableEntriesNum]; Util::checkAllocation(entries, "Can not allocate " + SSTR(tableEntriesNum * sizeof(IndexEntryLocal)) + " bytes for entries in IndexTable::initMemory"); - memcpy(this->entries, entryOffsets, tableEntriesNum * sizeof(IndexEntryLocal)); + memcpy(this->entries, entries, tableEntriesNum * sizeof(IndexEntryLocal)); - offsets = new(std::nothrow) size_t[tableSize + 1]; - Util::checkAllocation(offsets, "Can not allocate " + SSTR((tableSize +1) * sizeof(size_t)) + " bytes for offsets in IndexTable::initMemory"); - memcpy(offsets, entryOffsets, (tableSize + 1) * sizeof(size_t)); + memcpy(this->offsets, entryOffsets, (tableSize + 1) * sizeof(size_t)); } void revertPointer() { diff --git a/src/prefiltering/Prefiltering.cpp b/src/prefiltering/Prefiltering.cpp index 06b8738..6f67162 100644 --- a/src/prefiltering/Prefiltering.cpp +++ b/src/prefiltering/Prefiltering.cpp @@ -9,8 +9,7 @@ #include "Timer.h" #include "ByteParser.h" #include "Parameters.h" -#include - +#include "MemoryMapped.h" namespace prefilter { #include "ExpOpt3_8_polished.cs32.lib.h" @@ -245,10 +244,12 @@ Prefiltering::~Prefiltering() { tdbr->close(); delete tdbr; - if (templateDBIsIndex == true && preloadMode != Parameters::PRELOAD_MODE_FREAD) { + if (templateDBIsIndex == true) { tidxdbr->close(); delete tidxdbr; - } else { + } + + if (templateDBIsIndex == false || preloadMode == Parameters::PRELOAD_MODE_FREAD) { ExtendedSubstitutionMatrix::freeScoreMatrix(_3merSubMatrix); ExtendedSubstitutionMatrix::freeScoreMatrix(_2merSubMatrix); } @@ -386,118 +387,137 @@ void Prefiltering::setupSplit(DBReader& tdbr, 
const int alphabetSi } } -void Prefiltering::mergeTargetSplits(const std::string &outDB, const std::string &outDBIndex, const std::vector> &fileNames) { - if (fileNames.size() < 2) { +void Prefiltering::mergeTargetSplits(const std::string &outDB, const std::string &outDBIndex, const std::vector> &fileNames, unsigned int threads) { + // we assume that the hits are in the same order + const size_t splits = fileNames.size(); + + if (splits < 2) { DBReader::moveDb(fileNames[0].first, outDB); Debug(Debug::INFO) << "No merging needed.\n"; return; } - // we assume that the hits are in the same order Timer timer; - // merge the index files and find the largest entry - size_t largestEntrySize = 0; - Debug(Debug::INFO) << "Merging " << fileNames.size() << " target splits into " << outDB << "\n"; + Debug(Debug::INFO) << "Merging " << splits << " target splits to " << FileUtil::baseName(outDB) << "\n"; DBReader reader1(fileNames[0].first.c_str(), fileNames[0].second.c_str(), 1, DBReader::USE_INDEX); reader1.open(DBReader::NOSORT); DBReader::Index *index1 = reader1.getIndex(); - for (size_t i = 1; i < fileNames.size(); i++) { + size_t totalSize = 0; + for (size_t id = 0; id < reader1.getSize(); id++) { + totalSize += index1[id].length; + } + for (size_t i = 1; i < splits; ++i) { DBReader reader2(fileNames[i].first.c_str(), fileNames[i].second.c_str(), 1, DBReader::USE_INDEX); reader2.open(DBReader::NOSORT); DBReader::Index *index2 = reader2.getIndex(); size_t currOffset = 0; - for (size_t id = 0; id < reader1.getSize(); id++) { // add length for file1 and file2 and subtract -1 for one null byte size_t seqLen = index1[id].length + index2[id].length - 1; + totalSize += index2[id].length - 1; index1[id].length = seqLen; index1[id].offset = currOffset; currOffset += seqLen; + } + reader2.close(); + } + reader1.setDataSize(totalSize); + + size_t *starts = new size_t[threads]; + size_t *lengths = new size_t[threads]; + size_t **offsetStart = new size_t*[threads]; + for (size_t i = 0; i < threads; ++i) { + reader1.decomposeDomainByAminoAcid(i, threads, &starts[i], &lengths[i]); + offsetStart[i] = new size_t[splits]; + } - // keep track of longest - if (largestEntrySize < seqLen) { - largestEntrySize = seqLen; - } + for (size_t s = 0; s < splits; ++s) { + DBReader reader2(fileNames[s].first.c_str(), fileNames[s].second.c_str(), 1, DBReader::USE_INDEX); + reader2.open(DBReader::NOSORT); + DBReader::Index *index2 = reader2.getIndex(); + for (size_t t = 0; t < threads; t++) { + offsetStart[t][s] = index2[starts[t]].offset; } reader2.close(); } - FILE *outDBIndexFILE = FileUtil::openAndDelete(outDBIndex.c_str(), "w"); - DBWriter::writeIndex(outDBIndexFILE, reader1.getSize(), index1); - fclose(outDBIndexFILE); - reader1.close(); - + Debug(Debug::INFO) << "Preparing offsets for merging: " << timer.lap() << "\n"; // merge target splits data files and sort the hits at the same time - FILE ** files = new FILE*[fileNames.size()]; - for (size_t i = 0; i < fileNames.size();i++) { - files[i] = FileUtil::openFileOrDie(fileNames[i].first.c_str(), "r", true); -#if HAVE_POSIX_FADVISE - int status; - if ((status = posix_fadvise (fileno(files[i]), 0, 0, POSIX_FADV_SEQUENTIAL)) != 0){ - Debug(Debug::ERROR) << "posix_fadvise returned an error: " << strerror(status) << "\n"; - } + // TODO: compressed? 
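// Simplified standalone sketch (plain structs, illustrative values) of the merge
// strategy implemented in this function: every target split stores its hits for the
// same queries in the same order, so per query the per-split hit lists are concatenated
// and re-sorted by score before being written once. The real code does this per thread
// on hit_t records parsed with QueryMatcher::parsePrefilterHit and sorted with
// hit_t::compareHitsByScoreAndId.
#include <algorithm>
#include <cstdio>
#include <vector>

struct Hit { unsigned int targetKey; int score; };

static bool byScoreThenKey(const Hit &a, const Hit &b) {
    if (a.score != b.score) return a.score > b.score;   // best score first
    return a.targetKey < b.targetKey;                   // deterministic tie-break
}

int main() {
    // hits of one query against two target splits
    std::vector<std::vector<Hit> > splits = {
        { {10, 55}, {11, 12} },    // hits from target split 0
        { {207, 80}, {231, 7} }    // hits from target split 1
    };
    std::vector<Hit> merged;
    for (size_t i = 0; i < splits.size(); ++i) {
        merged.insert(merged.end(), splits[i].begin(), splits[i].end());
    }
    std::sort(merged.begin(), merged.end(), byScoreThenKey);
    for (size_t i = 0; i < merged.size(); ++i) {
        std::printf("target %u\tscore %d\n", merged[i].targetKey, merged[i].score);
    }
    return 0;
}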
+ DBWriter writer(outDB.c_str(), outDBIndex.c_str(), threads, 0, Parameters::DBTYPE_PREFILTER_RES); + writer.open(); + + Debug::Progress pregress(reader1.getSize()); +#pragma omp parallel num_threads(threads) + { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); #endif - } - - int c1 = EOF; - char * hitsBuffer = new char[largestEntrySize + 1]; - size_t writePos = 0; - - std::string result; - result.reserve(largestEntrySize + 1); - - FILE * outDBFILE = FileUtil::openAndDelete(outDB.c_str(), "w"); - do { - // go over all files - for (size_t i = 0; i < fileNames.size(); ++i) { - // collect everything until the next entry - while ((c1 = getc_unlocked(files[i])) != EOF) { - if (c1 == '\0') { - break; + + std::string result; + result.reserve(1024); + + std::vector hits; + hits.reserve(300); + + char buffer[1024]; + + size_t id = starts[thread_idx]; + size_t lastId = id + lengths[thread_idx]; + FILE** files = new FILE*[splits]; + for (size_t i = 0; i < splits; ++i) { + files[i] = fopen(fileNames[i].first.c_str(), "rb"); + fseek(files[i], offsetStart[thread_idx][i], SEEK_SET); + } + + while (id < lastId) { + pregress.updateProgress(); + for (size_t i = 0; i < splits; ++i) { + int c1 = EOF; + size_t pos = 0; + while ((c1 = getc_unlocked(files[i])) != EOF) { + buffer[pos++] = (char)c1; + if (c1 == '\n') { + hits.emplace_back(QueryMatcher::parsePrefilterHit(buffer)); + pos = 0; + } else if (c1 == '\0') { + break; + } } - hitsBuffer[writePos] = (char) c1; - writePos++; } - } - // collected all hits, separated by \n. add null byte to assure correct parsing of hitsBuffer - hitsBuffer[writePos] = '\0'; - writePos++; - // parse hitsBuffer as hits and sort them by score - - std::vector hits = QueryMatcher::parsePrefilterHits(hitsBuffer); - if (hits.size() > 1) { - std::sort(hits.begin(), hits.end(), hit_t::compareHitsByScoreAndId); - } - for (size_t hit_id = 0; hit_id < hits.size(); hit_id++) { - int len = QueryMatcher::prefilterHitToBuffer(hitsBuffer, hits[hit_id]); - result.append(hitsBuffer, len); - } - result.append(1, '\0'); - // write sorted hits - size_t written = fwrite(result.c_str(), sizeof(char), result.size(), outDBFILE); - result.clear(); - if (written != writePos) { - Debug(Debug::ERROR) << "Cannot write to data file " << outDB << "\n"; - EXIT(EXIT_FAILURE); + if (hits.size() > 1) { + std::sort(hits.begin(), hits.end(), hit_t::compareHitsByScoreAndId); + } + for (size_t i = 0; i < hits.size(); ++i) { + int len = QueryMatcher::prefilterHitToBuffer(buffer, hits[i]); + result.append(buffer, len); + } + writer.writeData(result.c_str(), result.size(), reader1.getDbKey(id), thread_idx); + hits.clear(); + result.clear(); + id++; } - // reset the buffer position - writePos = 0; - } while (c1 != EOF); - fclose(outDBFILE); - delete[] hitsBuffer; - // write output dbtype - DBWriter::writeDbtypeFile(outDB.c_str(), Parameters::DBTYPE_PREFILTER_RES, compressed); + for (size_t i = 0; i < splits; ++i) { + fclose(files[i]); + } + delete[] files; + delete[] offsetStart[thread_idx]; + } + writer.close(); + reader1.close(); - for (size_t i = 0; i < fileNames.size(); ++i) { - fclose(files[i]); + for (size_t i = 0; i < splits; ++i) { DBReader::removeDb(fileNames[i].first); } - delete[] files; + delete[] offsetStart; + delete[] lengths; + delete[] starts; - Debug(Debug::INFO) << "\nTime for merging target splits: " << timer.lap() << "\n"; + Debug(Debug::INFO) << "Time for merging target splits: " << timer.lap() << "\n"; } @@ -697,6 +717,19 @@ int 
Prefiltering::runSplits(const std::string &resultDB, const std::string &resu } if (splitFiles.size() > 0) { mergePrefilterSplits(resultDB, resultDBIndex, splitFiles); + if (splitFiles.size() > 1) { + DBReader resultReader(resultDB.c_str(), resultDBIndex.c_str(), threads, DBReader::USE_INDEX | DBReader::USE_DATA); + resultReader.open(DBReader::NOSORT); + resultReader.readMmapedDataInMemory(); + const std::pair tempDb = Util::databaseNames(resultDB + "_tmp"); + DBWriter resultWriter(tempDb.first.c_str(), tempDb.second.c_str(), threads, compressed, Parameters::DBTYPE_PREFILTER_RES); + resultWriter.open(); + resultWriter.sortDatafileByIdOrder(resultReader); + resultWriter.close(true); + resultReader.close(); + DBReader::removeDb(resultDB); + DBReader::moveDb(tempDb.first, resultDB); + } hasResult = true; } } else if (splitProcessCount == 1) { @@ -974,7 +1007,7 @@ BaseMatrix *Prefiltering::getSubstitutionMatrix(const ScoreMatrixFile &scoringMa void Prefiltering::mergePrefilterSplits(const std::string &outDB, const std::string &outDBIndex, const std::vector> &splitFiles) { if (splitMode == Parameters::TARGET_DB_SPLIT) { - mergeTargetSplits(outDB, outDBIndex, splitFiles); + mergeTargetSplits(outDB, outDBIndex, splitFiles, threads); } else if (splitMode == Parameters::QUERY_DB_SPLIT) { DBWriter::mergeResults(outDB, outDBIndex, splitFiles); } diff --git a/src/prefiltering/Prefiltering.h b/src/prefiltering/Prefiltering.h index 60626b8..be599f8 100644 --- a/src/prefiltering/Prefiltering.h +++ b/src/prefiltering/Prefiltering.h @@ -48,6 +48,9 @@ class Prefiltering { static int getKmerThreshold(const float sensitivity, const bool isProfile, const int kmerScore, const int kmerSize); + static void mergeTargetSplits(const std::string &outDB, const std::string &outDBIndex, + const std::vector> &fileNames, unsigned int threads); + private: const std::string queryDB; const std::string queryDBIndex; @@ -120,8 +123,6 @@ class Prefiltering { void printStatistics(const statistics_t &stats, std::list **reslens, unsigned int resLensSize, size_t empty, size_t maxResults); - void mergeTargetSplits(const std::string &outDB, const std::string &outDBIndex, const std::vector> &fileNames); - bool isSameQTDB(); void reopenTargetDb(); diff --git a/src/prefiltering/PrefilteringIndexReader.cpp b/src/prefiltering/PrefilteringIndexReader.cpp index 83484e9..6fa2965 100644 --- a/src/prefiltering/PrefilteringIndexReader.cpp +++ b/src/prefiltering/PrefilteringIndexReader.cpp @@ -343,7 +343,7 @@ SequenceLookup *PrefilteringIndexReader::getSequenceLookup(unsigned int split, D if (preloadMode == Parameters::PRELOAD_MODE_FREAD) { SequenceLookup *sequenceLookup = new SequenceLookup(sequenceCount, seqDataSize); - sequenceLookup->initLookupByExternalDataCopy(seqData, seqDataSize, (size_t *) seqOffsetsData); + sequenceLookup->initLookupByExternalDataCopy(seqData, (size_t *) seqOffsetsData); return sequenceLookup; } @@ -384,7 +384,7 @@ IndexTable *PrefilteringIndexReader::getIndexTable(unsigned int split, DBReader< } if (preloadMode == Parameters::PRELOAD_MODE_FREAD) { - IndexTable* table = new IndexTable(adjustAlphabetSize, data.kmerSize, true); + IndexTable* table = new IndexTable(adjustAlphabetSize, data.kmerSize, false); table->initTableByExternalDataCopy(sequenceCount, entriesNum, (IndexEntryLocal*) entriesData, (size_t *)entriesOffsetsData); return table; } diff --git a/src/prefiltering/QueryMatcher.cpp b/src/prefiltering/QueryMatcher.cpp index 96b0140..e20d567 100644 --- a/src/prefiltering/QueryMatcher.cpp +++ 
b/src/prefiltering/QueryMatcher.cpp @@ -153,6 +153,7 @@ std::pair QueryMatcher::matchQuery (Sequence * querySeq, unsign } }else{ unsigned int thr = computeScoreThreshold(scoreSizes, this->maxHitsPerQuery); + thr = std::max(minDiagScoreThr, thr); if(resultSize < counterResultSize/2) { int elementsCntAboveDiagonalThr = radixSortByScoreSize(scoreSizes, foundDiagonals + resultSize, thr, foundDiagonals, resultSize); queryResult = getResult(foundDiagonals + resultSize, elementsCntAboveDiagonalThr, identityId, thr, ungappedAlignment, false); diff --git a/src/prefiltering/SequenceLookup.cpp b/src/prefiltering/SequenceLookup.cpp index 9c7b834..8fca556 100644 --- a/src/prefiltering/SequenceLookup.cpp +++ b/src/prefiltering/SequenceLookup.cpp @@ -72,14 +72,7 @@ void SequenceLookup::initLookupByExternalData(char *seqData, size_t seqDataSize, offsets = seqOffsets; } -void SequenceLookup::initLookupByExternalDataCopy(char *seqData, size_t seqDataSize, size_t *seqOffsets) { - dataSize = seqDataSize; - - data = new(std::nothrow) char[dataSize + 1]; - Util::checkAllocation(data, "Can not allocate data memory in SequenceLookup"); +void SequenceLookup::initLookupByExternalDataCopy(char *seqData, size_t *seqOffsets) { memcpy(data, seqData, (dataSize + 1) * sizeof(char)); - - offsets = new(std::nothrow) size_t[sequenceCount + 1]; - Util::checkAllocation(offsets, "Can not allocate offsets memory in SequenceLookup"); memcpy(offsets, seqOffsets, (sequenceCount + 1) * sizeof(size_t)); } diff --git a/src/prefiltering/SequenceLookup.h b/src/prefiltering/SequenceLookup.h index 9eb4008..b54079d 100644 --- a/src/prefiltering/SequenceLookup.h +++ b/src/prefiltering/SequenceLookup.h @@ -33,7 +33,7 @@ class SequenceLookup { size_t *getOffsets(); void initLookupByExternalData(char *seqData, size_t dataSize, size_t *seqOffsets); - void initLookupByExternalDataCopy(char *seqData, size_t dataSize, size_t *seqOffsets); + void initLookupByExternalDataCopy(char *seqData, size_t *seqOffsets); private: size_t sequenceCount; diff --git a/src/taxonomy/NcbiTaxonomy.cpp b/src/taxonomy/NcbiTaxonomy.cpp index aa20b7a..8cda330 100644 --- a/src/taxonomy/NcbiTaxonomy.cpp +++ b/src/taxonomy/NcbiTaxonomy.cpp @@ -300,8 +300,7 @@ TaxID NcbiTaxonomy::LCA(TaxID taxonA, TaxID taxonB) const { } else if (!nodeExists(taxonB)) { return taxonA; } - - return taxonNodes[lcaHelper(D[nodeId(taxonA)], D[nodeId(taxonB)])].taxId; + return taxonNodes[lcaHelper(nodeId(taxonA), nodeId(taxonB))].taxId; } diff --git a/src/taxonomy/lca.cpp b/src/taxonomy/lca.cpp index a230e4a..ff02835 100644 --- a/src/taxonomy/lca.cpp +++ b/src/taxonomy/lca.cpp @@ -35,7 +35,7 @@ int lca(int argc, const char **argv, const Command& command) { DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, Parameters::DBTYPE_TAXONOMICAL_RESULT); writer.open(); - std::vector ranks = Util::split(par.lcaRanks, ":"); + std::vector ranks = Util::split(par.lcaRanks, ";"); // a few NCBI taxa are blacklisted by default, they contain unclassified sequences (e.g. metagenomes) or other sequences (e.g. 
plasmids) // if we do not remove those, a lot of sequences would be classified as Root, even though they have a sensible LCA @@ -126,7 +126,7 @@ int lca(int argc, const char **argv, const Command& command) { resultData = SSTR(node->taxId) + '\t' + node->rank + '\t' + node->name; if (!ranks.empty()) { - std::string lcaRanks = Util::implode(t->AtRanks(node, ranks), ':'); + std::string lcaRanks = Util::implode(t->AtRanks(node, ranks), ';'); resultData += '\t' + lcaRanks; } if (par.showTaxLineage) { diff --git a/src/taxonomy/taxonomyreport.cpp b/src/taxonomy/taxonomyreport.cpp index 8f3e65a..3c1cc8b 100644 --- a/src/taxonomy/taxonomyreport.cpp +++ b/src/taxonomy/taxonomyreport.cpp @@ -4,6 +4,8 @@ #include "FileUtil.h" #include "Debug.h" #include "Util.h" +#include "krona_prelude.html.h" + #include #include @@ -35,12 +37,7 @@ unsigned int cladeCountVal(const std::unordered_map& map, Ta } -void taxReport(FILE* FP, - const NcbiTaxonomy& taxDB, - const std::unordered_map & cladeCounts, - unsigned long totalReads, - TaxID taxID = 0, int depth = 0) { - +void taxReport(FILE* FP, const NcbiTaxonomy& taxDB, const std::unordered_map & cladeCounts,unsigned long totalReads,TaxID taxID = 0, int depth = 0) { std::unordered_map::const_iterator it = cladeCounts.find(taxID); unsigned int cladeCount = it == cladeCounts.end()? 0 : it->second.cladeCount; unsigned int taxCount = it == cladeCounts.end()? 0 : it->second.taxCount; @@ -72,6 +69,66 @@ void taxReport(FILE* FP, } } +std::string escapeAttribute(const std::string& data) { + std::string buffer; + buffer.reserve(data.size() * 1.1); + for (size_t i = 0; i < data.size(); ++i) { + switch (data[i]) { + case '&': + buffer.append("&amp;"); + break; + case '\"': + buffer.append("&quot;"); + break; + case '\'': + buffer.append("&apos;"); + break; + case '<': + buffer.append("&lt;"); + break; + case '>': + buffer.append("&gt;"); + break; + default: + buffer.append(1, data[i]); + break; + } + } + return buffer; +} + +void kronaReport(FILE* FP, const NcbiTaxonomy& taxDB, const std::unordered_map & cladeCounts,unsigned long totalReads,TaxID taxID = 0, int depth = 0) { + std::unordered_map::const_iterator it = cladeCounts.find(taxID); + unsigned int cladeCount = it == cladeCounts.end()? 0 : it->second.cladeCount; +// unsigned int taxCount = it == cladeCounts.end()? 
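// Usage sketch for the escapeAttribute() helper added above: taxon names may contain
// &, quotes or angle brackets, which would otherwise terminate the XML attribute the
// Krona report writes them into. Note that the fprintf format strings in the
// kronaReport/taxonomyreport hunks that follow lost their markup during extraction
// (they appear as bare "%d"/"%zu"); the <node>/<magnitude>/<val> layout printed here
// is an assumed illustration, not text recovered from the patch.
#include <cstdio>
#include <string>

static std::string escapeAttribute(const std::string &data) {   // same switch as the helper above
    std::string buffer;
    buffer.reserve(data.size() + data.size() / 10);
    for (size_t i = 0; i < data.size(); ++i) {
        switch (data[i]) {
            case '&':  buffer.append("&amp;");  break;
            case '\"': buffer.append("&quot;"); break;
            case '\'': buffer.append("&apos;"); break;
            case '<':  buffer.append("&lt;");   break;
            case '>':  buffer.append("&gt;");   break;
            default:   buffer.append(1, data[i]); break;
        }
    }
    return buffer;
}

int main() {
    std::string name = "Candidatus \"Foo & Bar\" <draft>";   // made-up species label
    // assumed Krona-style element; tag names are illustrative only
    std::printf("<node name=\"%s\"><magnitude><val>%u</val></magnitude></node>\n",
                escapeAttribute(name).c_str(), 42u);
    return 0;
}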
0 : it->second.taxCount; + if (cladeCount == 0) { + return; + } + if (taxID == 0) { + if (cladeCount > 0) { + fprintf(FP, "%d", cladeCount); + } + kronaReport(FP, taxDB, cladeCounts, totalReads, 1); + } else { + if (cladeCount == 0) { + return; + } + const TaxonNode* taxon = taxDB.taxonNode(taxID); + std::string escapedName = escapeAttribute(taxon->name); + fprintf(FP, "%d", escapedName.c_str(), cladeCount); + std::vector children = it->second.children; + std::sort(children.begin(), children.end(), [&](int a, int b) { return cladeCountVal(cladeCounts, a) > cladeCountVal(cladeCounts,b); }); + for (TaxID childTaxId : children) { + if (cladeCounts.count(childTaxId)) { + kronaReport(FP, taxDB, cladeCounts, totalReads, childTaxId, depth + 1); + } else { + break; + } + } + fprintf(FP, ""); + } +} + int taxonomyreport(int argc, const char **argv, const Command& command) { Parameters& par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); @@ -89,7 +146,7 @@ int taxonomyreport(int argc, const char **argv, const Command& command) { std::stable_sort(mapping.begin(), mapping.end(), compareToFirstInt); } - DBReader reader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); + DBReader reader(par.db2.c_str(), par.db2Index.c_str(), 1, DBReader::USE_DATA|DBReader::USE_INDEX); reader.open(DBReader::LINEAR_ACCCESS); // TODO: Better way to get file specified by param3? @@ -134,8 +191,16 @@ int taxonomyreport(int argc, const char **argv, const Command& command) { Debug(Debug::INFO) << taxCounts.at(0) << " reads are unclassified.\n"; std::unordered_map cladeCounts = taxDB->getCladeCounts(taxCounts); - taxReport(resultFP, *taxDB, cladeCounts, reader.getSize()); + if (par.reportMode == 0) { + taxReport(resultFP, *taxDB, cladeCounts, reader.getSize()); + } else { + fwrite(krona_prelude_html, krona_prelude_html_len, sizeof(char), resultFP); + fprintf(resultFP, "%zu", reader.getSize()); + kronaReport(resultFP, *taxDB, cladeCounts, reader.getSize()); + fprintf(resultFP, "
"); + } delete taxDB; reader.close(); return EXIT_SUCCESS; } + diff --git a/src/test/TestDBReader.cpp b/src/test/TestDBReader.cpp index e0827ad..c22a5bc 100644 --- a/src/test/TestDBReader.cpp +++ b/src/test/TestDBReader.cpp @@ -1,12 +1,9 @@ #include #include #include -#include - +#include #include "Clustering.h" -#include "SetElement.h" - #include "DBReader.h" #include "DBWriter.h" diff --git a/src/test/TestDBReaderZstd.cpp b/src/test/TestDBReaderZstd.cpp index 406dd96..a1e4b00 100644 --- a/src/test/TestDBReaderZstd.cpp +++ b/src/test/TestDBReaderZstd.cpp @@ -7,8 +7,6 @@ #include #include "Clustering.h" -#include "SetElement.h" - #include "DBReader.h" #include "DBWriter.h" #include "Parameters.h" diff --git a/src/test/TestDiagonalScoring.cpp b/src/test/TestDiagonalScoring.cpp index 25f4b64..ecd4501 100644 --- a/src/test/TestDiagonalScoring.cpp +++ b/src/test/TestDiagonalScoring.cpp @@ -9,11 +9,8 @@ #include "ExtendedSubstitutionMatrix.h" #include "Clustering.h" -#include "SetElement.h" - #include "DBReader.h" #include "DBWriter.h" - #include "Parameters.h" const char* binary_name = "test_diagonalscoring"; diff --git a/src/test/TestDiagonalScoringPerformance.cpp b/src/test/TestDiagonalScoringPerformance.cpp index a740260..8671a2e 100644 --- a/src/test/TestDiagonalScoringPerformance.cpp +++ b/src/test/TestDiagonalScoringPerformance.cpp @@ -15,11 +15,8 @@ KSEQ_INIT(int, read) #include "Clustering.h" -#include "SetElement.h" - #include "DBReader.h" #include "DBWriter.h" - #include "Parameters.h" const char* binary_name = "test_diagonalscoringperformance"; diff --git a/src/test/TestSequenceIndex.cpp b/src/test/TestSequenceIndex.cpp index 2654bff..883b22c 100644 --- a/src/test/TestSequenceIndex.cpp +++ b/src/test/TestSequenceIndex.cpp @@ -6,7 +6,6 @@ #include "SequenceLookup.h" #include "SubstitutionMatrix.h" #include "Clustering.h" -#include "SetElement.h" #include "DBReader.h" #include "DBWriter.h" #include "Parameters.h" diff --git a/src/util/apply.cpp b/src/util/apply.cpp index bd67555..7d5d1b0 100644 --- a/src/util/apply.cpp +++ b/src/util/apply.cpp @@ -284,7 +284,6 @@ int apply(int argc, const char **argv, const Command& command) { DBReader reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_DATA|DBReader::USE_INDEX); reader.open(DBReader::SORT_BY_LENGTH); - const DBReader::Index *readerIndex = reader.getIndex(); Debug::Progress progress(reader.getSize()); #ifdef HAVE_MPI @@ -340,22 +339,21 @@ int apply(int argc, const char **argv, const Command& command) { continue; } - size_t index = i; - size_t size = readerIndex[i].length - 1; - - char *data = reader.getData(index, thread); - if (data == NULL) { + unsigned int key = reader.getDbKey(i); + char *data = reader.getData(i, thread); + if (*data == '\0') { + writer.writeData(NULL, 0, key, thread); continue; } - unsigned int key = reader.getDbKey(index); + size_t size = reader.getEntryLen(i) - 1; int status = apply_by_entry(data, size, key, writer, par.restArgv[0], const_cast(par.restArgv), local_environ, 0); if (status == -1) { - Debug(Debug::WARNING) << "Entry " << index << " system error " << errno << "!\n"; + Debug(Debug::WARNING) << "Entry " << key << " system error number " << errno << "!\n"; continue; } if (status > 0) { - Debug(Debug::WARNING) << "Entry " << index << " exited with error code " << status << "!\n"; + Debug(Debug::WARNING) << "Entry " << key << " exited with error code " << status << "!\n"; continue; } } diff --git a/src/util/convert2fasta.cpp b/src/util/convert2fasta.cpp index 897ded0..8d9ab68 100644 
--- a/src/util/convert2fasta.cpp +++ b/src/util/convert2fasta.cpp @@ -12,7 +12,7 @@ #include "Util.h" #include "FileUtil.h" -const char header_start[] = {'>'}; +const char headerStart[] = {'>'}; const char newline[] = {'\n'}; int convert2fasta(int argc, const char **argv, const Command& command) { @@ -40,15 +40,18 @@ int convert2fasta(int argc, const char **argv, const Command& command) { Debug(Debug::INFO) << "Start writing file to " << par.db2 << "\n"; for(size_t i = 0; i < from->getSize(); i++){ unsigned int key = from->getDbKey(i); + unsigned int headerKey = db_header.getId(key); + const char* headerData = db_header.getData(headerKey, 0); + const size_t headerLen = db_header.getEntryLen(headerKey); - const char* header_data = db_header.getDataByDBKey(key, 0); - - fwrite(header_start, sizeof(char), 1, fastaFP); - fwrite(header_data, sizeof(char), strlen(header_data) - 1, fastaFP); + fwrite(headerStart, sizeof(char), 1, fastaFP); + fwrite(headerData, sizeof(char), headerLen - 2, fastaFP); fwrite(newline, sizeof(char), 1, fastaFP); - const char* body_data = db.getDataByDBKey(key, 0); - fwrite(body_data, sizeof(char), strlen(body_data) - 1, fastaFP); + unsigned int bodyKey = db.getId(key); + const char* bodyData = db.getData(bodyKey, 0); + const size_t bodyLen = db.getEntryLen(bodyKey); + fwrite(bodyData, sizeof(char), bodyLen - 2, fastaFP); fwrite(newline, sizeof(char), 1, fastaFP); } diff --git a/src/util/convertalignments.cpp b/src/util/convertalignments.cpp index 1b1cc73..4d34ecb 100644 --- a/src/util/convertalignments.cpp +++ b/src/util/convertalignments.cpp @@ -10,11 +10,15 @@ #include "Sequence.h" #include "Orf.h" #include "MemoryMapped.h" +#include "NcbiTaxonomy.h" + +#include #ifdef OPENMP #include #endif + void printSeqBasedOnAln(std::string &out, const char *seq, unsigned int offset, const std::string &bt, bool reverse, bool isReverseStrand, bool translateSequence, const TranslateNucl &translateNucl) { @@ -131,6 +135,9 @@ std::map readSetToSource(const std::string& file) { return mapping; } +static bool compareToFirstInt(const std::pair& lhs, const std::pair& rhs){ + return (lhs.first <= rhs.first); +} int convertalignments(int argc, const char **argv, const Command &command) { Parameters &par = Parameters::getInstance(); @@ -145,11 +152,31 @@ int convertalignments(int argc, const char **argv, const Command &command) { bool needFullHeaders = false; bool needLookup = false; bool needSource = false; - const std::vector outcodes = Parameters::getOutputFormat(par.outfmt, needSequenceDB, needBacktrace, needFullHeaders, needLookup, needSource); + bool needTaxonomy = false; + bool needTaxonomyMapping = false; + const std::vector outcodes = Parameters::getOutputFormat(par.outfmt, needSequenceDB, needBacktrace, needFullHeaders, + needLookup, needSource, needTaxonomyMapping, needTaxonomy); if(format == Parameters::FORMAT_ALIGNMENT_SAM){ needSequenceDB = true; needBacktrace = true; } + + NcbiTaxonomy * t = NULL; + std::vector> mapping; + if(needTaxonomy){ + t = NcbiTaxonomy::openTaxonomy(par.db2); + } + if(needTaxonomy || needTaxonomyMapping){ + if(FileUtil::fileExists(std::string(par.db2 + "_mapping").c_str()) == false){ + Debug(Debug::ERROR) << par.db2 + "_mapping" << " does not exist. 
Please create the taxonomy mapping!\n"; + EXIT(EXIT_FAILURE); + } + bool isSorted = Util::readMapping( par.db2 + "_mapping", mapping); + if(isSorted == false){ + std::stable_sort(mapping.begin(), mapping.end(), compareToFirstInt); + } + } + bool isTranslatedSearch = false; int dbaccessMode = needSequenceDB ? (DBReader::USE_INDEX | DBReader::USE_DATA) : (DBReader::USE_INDEX); @@ -157,17 +184,19 @@ int convertalignments(int argc, const char **argv, const Command &command) { std::map qKeyToSet; std::map tKeyToSet; if (needLookup) { - std::string file = par.db1 + ".lookup"; - qKeyToSet = readKeyToSet(file); - tKeyToSet = readKeyToSet(file); + std::string file1 = par.db1 + ".lookup"; + std::string file2 = par.db2 + ".lookup"; + qKeyToSet = readKeyToSet(file1); + tKeyToSet = readKeyToSet(file2); } std::map qSetToSource; std::map tSetToSource; if (needSource) { - std::string file = par.db1 + ".source"; - qSetToSource = readSetToSource(file); - tSetToSource = readSetToSource(file); + std::string file1 = par.db1 + ".source"; + std::string file2 = par.db2 + ".source"; + qSetToSource = readSetToSource(file1); + tSetToSource = readSetToSource(file2); } IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); @@ -183,8 +212,8 @@ int convertalignments(int argc, const char **argv, const Command &command) { tDbrHeader = new IndexReader(par.db2, par.threads, IndexReader::SRC_HEADERS, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0); } - bool queryNucs = Parameters::isEqualDbtype(qDbr.sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES); - bool targetNucs = Parameters::isEqualDbtype(tDbr->sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES); + const bool queryNucs = Parameters::isEqualDbtype(qDbr.sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES); + const bool targetNucs = Parameters::isEqualDbtype(tDbr->sequenceReader->getDbtype(), Parameters::DBTYPE_NUCLEOTIDES); if (needSequenceDB) { // try to figure out if search was translated. This is can not be solved perfectly. 
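// Simplified sketch (illustrative names) of the taxid lookup behind the new taxonomy
// output columns below: the "<targetDB>_mapping" file is loaded into a vector of
// (dbKey, taxId) pairs sorted by key, and each alignment's target key is resolved with
// a binary search; keys without a mapping fall back to taxon 0, printed as
// "unclassified". The real code searches with std::upper_bound and its own comparator;
// std::lower_bound is used here for clarity.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

typedef std::pair<unsigned int, unsigned int> KeyToTaxon;   // (dbKey, taxId)

static bool byKey(const KeyToTaxon &a, const KeyToTaxon &b) {
    return a.first < b.first;
}

static unsigned int lookupTaxon(const std::vector<KeyToTaxon> &mapping, unsigned int dbKey) {
    KeyToTaxon probe(dbKey, 0);
    std::vector<KeyToTaxon>::const_iterator it =
        std::lower_bound(mapping.begin(), mapping.end(), probe, byKey);
    if (it == mapping.end() || it->first != dbKey) {
        return 0;   // no mapping entry -> unclassified
    }
    return it->second;
}

int main() {
    std::vector<KeyToTaxon> mapping;                 // normally read from <targetDB>_mapping
    mapping.push_back(KeyToTaxon(3, 562));
    mapping.push_back(KeyToTaxon(7, 9606));
    mapping.push_back(KeyToTaxon(42, 2157));         // already sorted by key
    std::printf("key 7  -> taxid %u\n", lookupTaxon(mapping, 7));
    std::printf("key 11 -> taxid %u\n", lookupTaxon(mapping, 11));
    return 0;
}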
bool seqtargetAA = false; @@ -241,7 +270,6 @@ int convertalignments(int argc, const char **argv, const Command &command) { const bool isDb = par.dbOut; TranslateNucl translateNucl(static_cast(par.translationTable)); - if (format == Parameters::FORMAT_ALIGNMENT_SAM) { char buffer[1024]; unsigned int lastKey = tDbr->sequenceReader->getLastKey(); @@ -307,6 +335,7 @@ int convertalignments(int argc, const char **argv, const Command &command) { std::string newBacktrace; newBacktrace.reserve(1024); + const TaxonNode * taxonNode = NULL; #pragma omp for schedule(dynamic, 10) for (size_t i = 0; i < alnDbr.getSize(); i++) { @@ -409,6 +438,24 @@ int convertalignments(int argc, const char **argv, const Command &command) { } else { char *targetSeqData = NULL; targetProfData.clear(); + unsigned int taxon = 0; + + if(needTaxonomy || needTaxonomyMapping) { + std::pair val; + val.first = res.dbKey; + std::vector>::iterator mappingIt; + mappingIt = std::upper_bound(mapping.begin(), mapping.end(), val, compareToFirstInt); + if (mappingIt == mapping.end() || mappingIt->first != val.first) { + taxon = 0; + }else{ + taxon = mappingIt->second; + if(needTaxonomy){ + taxonNode = t->taxonNode(taxon, false); + } + } + + } + if (needSequenceDB) { size_t tId = tDbr->sequenceReader->getId(res.dbKey); targetSeqData = tDbr->sequenceReader->getData(tId, thread_idx); @@ -464,10 +511,6 @@ int convertalignments(int argc, const char **argv, const Command &command) { result.append(SSTR(res.score)); break; case Parameters::OUTFMT_CIGAR: - if(isTranslatedSearch == true && targetNucs == true && queryNucs == true ){ - Matcher::result_t::protein2nucl(res.backtrace, newBacktrace); - res.backtrace = newBacktrace; - } result.append(SSTR(res.backtrace)); newBacktrace.clear(); break; @@ -537,6 +580,15 @@ int convertalignments(int argc, const char **argv, const Command &command) { case Parameters::OUTFMT_TSETID: result.append(SSTR(tKeyToSet[res.dbKey])); break; + case Parameters::OUTFMT_TAXID: + result.append(SSTR(taxon)); + break; + case Parameters::OUTFMT_TAXNAME: + result.append((taxonNode != NULL) ? taxonNode->name : "unclassified"); + break; + case Parameters::OUTFMT_TAXLIN: + result.append((taxonNode != NULL) ? 
t->taxLineage(taxonNode) : "unclassified"); + break; case Parameters::OUTFMT_EMPTY: result.push_back('-'); break; @@ -579,14 +631,7 @@ int convertalignments(int argc, const char **argv, const Command &command) { continue; } result.append(buffer, count); - if (isTranslatedSearch == true && targetNucs == true && queryNucs == true) { - Matcher::result_t::protein2nucl(res.backtrace, newBacktrace); - result.append(newBacktrace); - newBacktrace.clear(); - - } else { - result.append(res.backtrace); - } + result.append(res.backtrace); result.append("\t*\t0\t0\t"); int start = std::min(res.qStartPos, res.qEndPos); int end = std::max(res.qStartPos, res.qEndPos); @@ -642,7 +687,9 @@ int convertalignments(int argc, const char **argv, const Command &command) { if (isDb == false) { FileUtil::remove(par.db4Index.c_str()); } - + if(needTaxonomy){ + delete t; + } alnDbr.close(); if (sameDB == false) { delete tDbr; diff --git a/src/util/convertmsa.cpp b/src/util/convertmsa.cpp index e5d2d76..52bc61b 100644 --- a/src/util/convertmsa.cpp +++ b/src/util/convertmsa.cpp @@ -5,6 +5,7 @@ #include #include +#include #ifdef HAVE_ZLIB #include "gzstream.h" diff --git a/src/util/createdb.cpp b/src/util/createdb.cpp index cf9593e..0a1156a 100644 --- a/src/util/createdb.cpp +++ b/src/util/createdb.cpp @@ -16,10 +16,6 @@ int createdb(int argc, const char **argv, const Command& command) { Parameters &par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, Parameters::PARSE_VARIADIC, 0); - if (par.maxSeqLen == Parameters::MAX_SEQ_LEN) { - par.maxSeqLen = Parameters::MAX_SEQ_LEN - 1; - } - std::vector filenames(par.filenames); std::string dataFile = filenames.back(); filenames.pop_back(); @@ -61,10 +57,6 @@ int createdb(int argc, const char **argv, const Command& command) { } } if (isNuclCnt) { - if (isNuclDb == false) { - Debug(Debug::WARNING) << "Assuming DNA database, forcing parameter --dont-split-seq-by-len true\n"; - par.splitSeqByLen = false; - } isNuclDb = true; } } @@ -77,14 +69,20 @@ int createdb(int argc, const char **argv, const Command& command) { } std::string indexFile = dataFile + ".index"; + if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && par.shuffleDatabase) { + Debug(Debug::WARNING) << "Shuffle database can not be combined with --createdb-mode 0.\n"; + Debug(Debug::WARNING) << "We recompute with --shuffle 0.\n"; + par.shuffleDatabase = false; + } const unsigned int shuffleSplits = par.shuffleDatabase ? 
32 : 1; - DBWriter seqWriter(dataFile.c_str(), indexFile.c_str(), shuffleSplits, par.compressed, dbType); - seqWriter.open(); + if (par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && par.compressed) { + Debug(Debug::WARNING) << "Compressed database can not be combined with --createdb-mode 0.\n"; + Debug(Debug::WARNING) << "We recompute with --compressed 0.\n"; + par.compressed = 0; + } std::string hdrDataFile = dataFile + "_h"; std::string hdrIndexFile = dataFile + "_h.index"; - DBWriter hdrWriter(hdrDataFile.c_str(), hdrIndexFile.c_str(), shuffleSplits, par.compressed, Parameters::DBTYPE_GENERIC_DB); - hdrWriter.open(); unsigned int entries_num = 0; size_t count = 0; @@ -94,8 +92,6 @@ int createdb(int argc, const char **argv, const Command& command) { const size_t testForNucSequence = 100; size_t isNuclCnt = 0; - std::string dataStr; - dataStr.reserve(1000000); Debug::Progress progress; std::vector* sourceLookup = new std::vector[shuffleSplits](); for (size_t i = 0; i < shuffleSplits; ++i) { @@ -104,20 +100,21 @@ int createdb(int argc, const char **argv, const Command& command) { Debug(Debug::INFO) << "Converting sequences\n"; std::string sourceFile = dataFile + ".source"; + + redoComputation: FILE *source = fopen(sourceFile.c_str(), "w"); if (source == NULL) { Debug(Debug::ERROR) << "Can not open " << sourceFile << " for writing!\n"; EXIT(EXIT_FAILURE); } - + DBWriter hdrWriter(hdrDataFile.c_str(), hdrIndexFile.c_str(), shuffleSplits, par.compressed, Parameters::DBTYPE_GENERIC_DB); + hdrWriter.open(); + DBWriter seqWriter(dataFile.c_str(), indexFile.c_str(), shuffleSplits, par.compressed, dbType); + seqWriter.open(); for (size_t fileIdx = 0; fileIdx < filenames.size(); fileIdx++) { unsigned int numEntriesInCurrFile = 0; - std::string splitHeader; - splitHeader.reserve(1024); std::string header; header.reserve(1024); - std::string splitId; - splitId.reserve(1024); char buffer[4096]; size_t len = snprintf(buffer, sizeof(buffer), "%zu\t%s\n", fileIdx, FileUtil::baseName(filenames[fileIdx]).c_str()); @@ -136,11 +133,6 @@ int createdb(int argc, const char **argv, const Command& command) { EXIT(EXIT_FAILURE); } - size_t splitCnt = 1; - if (par.splitSeqByLen == true) { - splitCnt = (size_t) ceilf(static_cast(e.sequence.l) / static_cast(par.maxSeqLen)); - } - // header header.append(e.name.s, e.name.l); if (e.comment.l > 0) { @@ -148,102 +140,83 @@ int createdb(int argc, const char **argv, const Command& command) { header.append(e.comment.s, e.comment.l); } - std::string headerId = Util::parseFastaHeader(header); + std::string headerId = Util::parseFastaHeader(header.c_str()); if (headerId.empty()) { // An identifier is necessary for these two cases, so we should just give up Debug(Debug::WARNING) << "Can not extract identifier from entry " << entries_num << ".\n"; - } - for (size_t split = 0; split < splitCnt; split++) { - unsigned int id = par.identifierOffset + entries_num; - if (par.dbType == 0) { - // check for the first 10 sequences if they are nucleotide sequences - if (count < 10 || (count % 100) == 0) { - if (sampleCount < testForNucSequence) { - size_t cnt = 0; - for (size_t i = 0; i < e.sequence.l; i++) { - switch (toupper(e.sequence.s[i])) { - case 'T': - case 'A': - case 'G': - case 'C': - case 'U': - case 'N': - cnt++; - break; - } - } - const float nuclDNAFraction = static_cast(cnt) / static_cast(e.sequence.l); - if (nuclDNAFraction > 0.9) { - isNuclCnt += true; + unsigned int id = par.identifierOffset + entries_num; + if (par.dbType == 0) { + // check for the first 10 
sequences if they are nucleotide sequences + if (count < 10 || (count % 100) == 0) { + if (sampleCount < testForNucSequence) { + size_t cnt = 0; + for (size_t i = 0; i < e.sequence.l; i++) { + switch (toupper(e.sequence.s[i])) { + case 'T': + case 'A': + case 'G': + case 'C': + case 'U': + case 'N': + cnt++; + break; } } - sampleCount++; - } - if (isNuclCnt == sampleCount || isNuclCnt == testForNucSequence) { - if (isNuclDb == false) { - Debug(Debug::WARNING) << "Assume it is a DNA database.\n"; - Debug(Debug::WARNING) << "Set parameter --dont-split-seq-by-len\n"; - par.splitSeqByLen = false; - splitCnt = 1; + const float nuclDNAFraction = static_cast(cnt) / static_cast(e.sequence.l); + if (nuclDNAFraction > 0.9) { + isNuclCnt += true; } - isNuclDb = true; - } else if (isNuclDb == true && isNuclCnt != sampleCount) { - Debug(Debug::ERROR) << "Database does not look like a DNA database anymore. Sorry our prediction went wrong.\n"; - Debug(Debug::ERROR) << "Please recompute with --dbtype 1 flag.\n"; - EXIT(EXIT_FAILURE); } + sampleCount++; } - - splitId.append(headerId); - if (splitCnt > 1) { - splitId.append("_"); - splitId.append(SSTR(split)); + bool redoComp = false; + if (isNuclCnt == sampleCount || isNuclCnt == testForNucSequence) { + isNuclDb = true; + } else if (isNuclDb == true && isNuclCnt != sampleCount) { + Debug(Debug::WARNING) << "Database does not look like a DNA database anymore.\n"; + Debug(Debug::WARNING) << "We recompute as protein database.\n"; + dbType = Parameters::DBTYPE_AMINO_ACIDS; + redoComp = true; } - // For split entries replace the found identifier by identifier_splitNumber - // Also add another hint that it was split to the end of the header - splitHeader.append(header); - if (par.splitSeqByLen == true && splitCnt > 1) { - if (headerId.empty() == false) { - size_t pos = splitHeader.find(headerId); - if (pos != std::string::npos) { - splitHeader.erase(pos, headerId.length()); - splitHeader.insert(pos, splitId); - } - } - splitHeader.append(" Split="); - splitHeader.append(SSTR(split)); + if(par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && e.multiline == true){ + Debug(Debug::WARNING) << "Multiline fasta can not be combined with --createdb-mode 0.\n"; + Debug(Debug::WARNING) << "We recompute with --createdb-mode 1.\n"; + par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD; + redoComp = true; } - - // space is needed for later parsing - splitHeader.append(" ", 1); - splitHeader.append("\n"); - - // Finally write down the entry - unsigned int splitIdx = id % shuffleSplits; - sourceLookup[splitIdx].emplace_back(fileIdx); - - hdrWriter.writeData(splitHeader.c_str(), splitHeader.length(), id, splitIdx); - - if (par.splitSeqByLen) { - size_t len = std::min(par.maxSeqLen, e.sequence.l - split * par.maxSeqLen); - seqWriter.writeStart(splitIdx); - seqWriter.writeAdd(e.sequence.s + split * par.maxSeqLen, len, splitIdx); - seqWriter.writeAdd(&newline, 1, splitIdx); - seqWriter.writeEnd(id, splitIdx, true); - } else { - seqWriter.writeStart(splitIdx); - seqWriter.writeAdd(e.sequence.s, e.sequence.l, splitIdx); - seqWriter.writeAdd(&newline, 1, splitIdx); - seqWriter.writeEnd(id, splitIdx, true); + if (redoComp) { + progress.reset(SIZE_MAX); + hdrWriter.close(); + seqWriter.close(); + delete kseq; + fclose(source); + for (size_t i = 0; i < shuffleSplits; ++i) { + sourceLookup[i].clear(); + } + goto redoComputation; } - splitHeader.clear(); - splitId.clear(); + } - entries_num++; - numEntriesInCurrFile++; - count++; + // Finally write down the entry + unsigned int 
splitIdx = id % shuffleSplits; + sourceLookup[splitIdx].emplace_back(fileIdx); + header.push_back('\n'); + if(par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT){ + // +2 to emulate the \n\0 + hdrWriter.writeIndexEntry(id, e.offset, header.size()+2, 0); + seqWriter.writeIndexEntry(id, e.offset + header.size(), e.sequence.l+2, 0); + }else{ + hdrWriter.writeData(header.c_str(), header.length(), id, splitIdx); + seqWriter.writeStart(splitIdx); + seqWriter.writeAdd(e.sequence.s, e.sequence.l, splitIdx); + seqWriter.writeAdd(&newline, 1, splitIdx); + seqWriter.writeEnd(id, splitIdx, true); } + + entries_num++; + numEntriesInCurrFile++; + count++; header.clear(); } delete kseq; @@ -268,7 +241,14 @@ int createdb(int argc, const char **argv, const Command& command) { // fix ids DBWriter::createRenumberedDB(dataFile, indexFile, "", DBReader::LINEAR_ACCCESS); DBWriter::createRenumberedDB(hdrDataFile, hdrIndexFile, "", DBReader::LINEAR_ACCCESS); - + if(par.createdbMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT) { + for (size_t fileIdx = 0; fileIdx < filenames.size(); fileIdx++) { + if(par.sequenceSplitMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT){ + FileUtil::symlinkAbs(filenames[0], dataFile+"."+SSTR(fileIdx)); + FileUtil::symlinkAbs(filenames[0], hdrDataFile+"."+SSTR(fileIdx)); + } + } + } DBReader readerHeader(hdrDataFile.c_str(), hdrIndexFile.c_str(), 1, DBReader::USE_DATA | DBReader::USE_INDEX); readerHeader.open(DBReader::NOSORT); diff --git a/src/util/createseqfiledb.cpp b/src/util/createseqfiledb.cpp index 5f2d442..f0c90f6 100644 --- a/src/util/createseqfiledb.cpp +++ b/src/util/createseqfiledb.cpp @@ -37,7 +37,7 @@ int createseqfiledb(int argc, const char **argv, const Command& command) { #endif #pragma omp for schedule(dynamic, 100) for (size_t i = 0; i < numClusters; ++i){ - std::ostringstream fastaStream; + std::string resultStr; char* data = clusters.getData(i, thread_idx); size_t entries = Util::countLines(data, clusters.getEntryLen(i) - 1); @@ -50,6 +50,7 @@ int createseqfiledb(int argc, const char **argv, const Command& command) { size_t entries_num = 0; char dbKey[255 + 1]; while (std::getline(clusterEntries, entry)) { + resultStr.clear(); entries_num++; Util::parseKey((char*)entry.c_str(), dbKey); const unsigned int entryId = (unsigned int) strtoul(dbKey, NULL, 10); @@ -65,20 +66,31 @@ int createseqfiledb(int argc, const char **argv, const Command& command) { Debug(Debug::WARNING) << "Entry " << entry << " does not contain a sequence!" 
<< "\n"; continue; } + size_t lineLen = Util::skipLine(header) - header; + std::string headerStr(header, lineLen); + lineLen = Util::skipLine(body) - body; + std::string bodyStr(body, lineLen); if (entries_num == 1 && par.hhFormat) { - std::string consensusHeader(header); - fastaStream << "#" << header - << ">" << Util::removeAfterFirstSpace(consensusHeader) << "_consensus\n" << body - << ">" << header << body; + std::string consensusHeader(headerStr); + resultStr.push_back('#'); + resultStr.append(headerStr); + resultStr.push_back('>'); + resultStr.append(Util::removeAfterFirstSpace(consensusHeader)); + resultStr.append("_consensus\n"); + resultStr.append(bodyStr); + resultStr.push_back('>'); + resultStr.append(headerStr); + resultStr.append(bodyStr); } else { - fastaStream << ">" << header << body; + resultStr.push_back('>'); + resultStr.append(headerStr); + resultStr.append(bodyStr); } } - std::string fasta = fastaStream.str(); unsigned int key = clusters.getDbKey(i); - msaOut.writeData(fasta.c_str(), fasta.length(), key, thread_idx); + msaOut.writeData(resultStr.c_str(), resultStr.length(), key, thread_idx); } }; diff --git a/src/util/diffseqdbs.cpp b/src/util/diffseqdbs.cpp index 5e3ae4e..a268bdb 100644 --- a/src/util/diffseqdbs.cpp +++ b/src/util/diffseqdbs.cpp @@ -91,13 +91,11 @@ int diffseqdbs(int argc, const char **argv, const Command &command) { if (par.useSequenceId) { keysNew[id] = std::make_pair( Util::parseFastaHeader(newReader.getData(id,thread_idx)), - newReader.getDbKey(id) - ); + newReader.getDbKey(id)); } else { keysNew[id] = std::make_pair( Util::removeWhiteSpace(newReader.getData(id, thread_idx)), - newReader.getDbKey(id) - ); + newReader.getDbKey(id)); } } } diff --git a/src/util/extractdomains.cpp b/src/util/extractdomains.cpp index 7507f84..0f978a7 100644 --- a/src/util/extractdomains.cpp +++ b/src/util/extractdomains.cpp @@ -134,7 +134,7 @@ std::vector mapMsa(const std::string &msa, const std::vector &do continue; } - std::string name = Util::parseFastaHeader(fullName); + std::string name = Util::parseFastaHeader(fullName.c_str()); if (seq->comment.l > 0) { std::string comment(seq->comment.s); size_t start = comment.find("Split="); diff --git a/src/util/indexdb.cpp b/src/util/indexdb.cpp index d6008c7..154d079 100644 --- a/src/util/indexdb.cpp +++ b/src/util/indexdb.cpp @@ -44,6 +44,8 @@ int indexdb(int argc, const char **argv, const Command &command) { par.parseParameters(argc, argv, command, true, 0, 0); const bool sameDB = (par.db1 == par.db2); + + std::string path = FileUtil::getRealPathFromSymLink(par.db2dbtype); DBReader dbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); dbr.open(DBReader::NOSORT); diff --git a/src/util/offsetalignment.cpp b/src/util/offsetalignment.cpp index dcb57ac..a6ac056 100644 --- a/src/util/offsetalignment.cpp +++ b/src/util/offsetalignment.cpp @@ -100,12 +100,12 @@ void updateOffset(char* data, std::vector &results, const Orf Orf::SequenceLocation tloc = Orf::parseOrfHeader(header); res.dbKey = (tloc.id != UINT_MAX) ? tloc.id : res.dbKey; - size_t from = (tloc.id != UINT_MAX) ? tloc.from : (tloc.strand == Orf::STRAND_MINUS) ? 0 : res.dbLen - 1; + size_t from = (tloc.id != UINT_MAX) ? tloc.from : (tloc.strand == Orf::STRAND_MINUS) ? res.dbLen - 1 : 0; int dbStartPos = isNucleotideSearch ? res.dbStartPos : res.dbStartPos * 3; int dbEndPos = isNucleotideSearch ? 
res.dbEndPos : res.dbEndPos * 3; - if (tloc.strand == Orf::STRAND_MINUS && tloc.id != UINT_MAX) { + if (tloc.strand == Orf::STRAND_MINUS) { res.dbStartPos = from - dbStartPos; res.dbEndPos = from - dbEndPos; // account for last orf @@ -351,6 +351,9 @@ int offsetalignment(int argc, const char **argv, const Command &command) { results.reserve(300); tmp.reserve(300); + std::string newBacktrace; + newBacktrace.reserve(300); + #pragma omp for schedule(dynamic, 10) for (size_t i = 0; i < entryCount; ++i) { progress.updateProgress(); @@ -389,6 +392,12 @@ int offsetalignment(int argc, const char **argv, const Command &command) { for(size_t i = 0; i < results.size(); i++) { Matcher::result_t &res = results[i]; bool hasBacktrace = (res.backtrace.size() > 0); + if (isNuclNuclSearch == false && hasBacktrace) { + newBacktrace.reserve(res.backtrace.length() * 3); + Matcher::result_t::protein2nucl(res.backtrace, newBacktrace); + res.backtrace = newBacktrace; + newBacktrace.clear(); + } size_t len = Matcher::resultToBuffer(buffer, res, hasBacktrace, false); ss.append(buffer, len); } @@ -414,6 +423,12 @@ int offsetalignment(int argc, const char **argv, const Command &command) { for(size_t i = 0; i < results.size(); i++){ Matcher::result_t &res = results[i]; bool hasBacktrace = (res.backtrace.size() > 0); + if (isNuclNuclSearch == false && hasBacktrace) { + newBacktrace.reserve(res.backtrace.length() * 3); + Matcher::result_t::protein2nucl(res.backtrace, newBacktrace); + res.backtrace = newBacktrace; + newBacktrace.clear(); + } size_t len = Matcher::resultToBuffer(buffer, res, hasBacktrace, false); ss.append(buffer, len); } @@ -423,6 +438,12 @@ int offsetalignment(int argc, const char **argv, const Command &command) { for(size_t i = 0; i < tmp.size(); i++){ Matcher::result_t &res = tmp[i]; bool hasBacktrace = (res.backtrace.size() > 0); + if (isNuclNuclSearch == false && hasBacktrace) { + newBacktrace.reserve(res.backtrace.length() * 3); + Matcher::result_t::protein2nucl(res.backtrace, newBacktrace); + res.backtrace = newBacktrace; + newBacktrace.clear(); + } size_t len = Matcher::resultToBuffer(buffer, res, hasBacktrace, false); ss.append(buffer, len); } diff --git a/src/util/prefixid.cpp b/src/util/prefixid.cpp index 8cde893..e9c8af3 100644 --- a/src/util/prefixid.cpp +++ b/src/util/prefixid.cpp @@ -9,7 +9,7 @@ #endif -int addid(const std::string &db1, const std::string &db1Index, const std::string &db2, const std::string &db2Index, +int addid(const std::string &db1, const std::string &db1Index, const std::string &db2, const std::string &db2Index, const bool tsvOut, const std::string &mappingFile, const std::string &userStrToAdd, const bool isPrefix, const int threads, const int compressed) { DBReader reader(db1.c_str(), db1Index.c_str(), threads, DBReader::USE_INDEX|DBReader::USE_DATA); reader.open(DBReader::LINEAR_ACCCESS); @@ -23,9 +23,13 @@ const bool tsvOut, const std::string &mappingFile, const std::string &userStrToA size_t entries = reader.getSize(); Debug::Progress progress(entries); + bool doMapping = false; + DBReader * lookupReader=NULL; + if(mappingFile.size() > 0){ + lookupReader = new DBReader(mappingFile.c_str(), mappingFile.c_str(), 1, DBReader::USE_LOOKUP); + doMapping = true; + } - DBReader lookupReader(mappingFile.c_str(), mappingFile.c_str(), 1, DBReader::USE_LOOKUP); - const bool doMapping = lookupReader.getLookupSize() > 0; #pragma omp parallel { unsigned int thread_idx = 0; @@ -47,12 +51,12 @@ const bool tsvOut, const std::string &mappingFile, const std::string &userStrToA if 
(userStrToAdd != "") { strToAdd = userStrToAdd; } else if (doMapping) { - size_t lookupId = lookupReader.getLookupIdByKey(key); + size_t lookupId = lookupReader->getLookupIdByKey(key); if (lookupId == SIZE_MAX) { Debug(Debug::ERROR) << "Could not find key " << key << " in lookup\n"; EXIT(EXIT_FAILURE); } - strToAdd = lookupReader.getLookupEntryName(lookupId); + strToAdd = lookupReader->getLookupEntryName(lookupId); } else { strToAdd = SSTR(key); } @@ -73,7 +77,9 @@ const bool tsvOut, const std::string &mappingFile, const std::string &userStrToA FileUtil::remove(writer.getIndexFileName()); } reader.close(); - + if(doMapping){ + delete lookupReader; + } return EXIT_SUCCESS; } diff --git a/src/util/result2flat.cpp b/src/util/result2flat.cpp index 4d2b347..e553a8e 100644 --- a/src/util/result2flat.cpp +++ b/src/util/result2flat.cpp @@ -36,7 +36,9 @@ int result2flat(int argc, const char **argv, const Command &command) { std::string headerStr; if (par.useHeader == true){ - headerStr = header_data; + size_t lineLen = Util::skipLine(header_data) - header_data; + headerStr.assign(header_data, lineLen); + if (headerStr.length() > 0) { if (headerStr[headerStr.length() - 1] == '\n') { headerStr[headerStr.length() - 1] = ' '; diff --git a/src/util/result2msa.cpp b/src/util/result2msa.cpp index fb47ecf..76f744c 100644 --- a/src/util/result2msa.cpp +++ b/src/util/result2msa.cpp @@ -123,13 +123,14 @@ int result2msa(Parameters &par, const std::string &resultData, const std::string // Get the sequence from the queryDB unsigned int queryKey = resultReader.getDbKey(id); - char *seqData = qDbr.getDataByDBKey(queryKey, thread_idx); - if (seqData == NULL) { - Debug(Debug::WARNING) << "Empty sequence " << id << ". Skipping.\n"; + size_t queryId = qDbr.getId(queryKey); + if (queryId == UINT_MAX) { + Debug(Debug::WARNING) << "Invalid query sequence " << queryKey << ".\n"; continue; } + centerSequence.mapSequence(id, queryKey, qDbr.getData(queryId, thread_idx), qDbr.getSeqLen(queryId)); - centerSequence.mapSequence(0, queryKey, seqData, resultReader.getSeqLen(id)); + // TODO: Do we still need this? 
if (centerSequence.L) { if(centerSequence.int_sequence[centerSequence.L-1] == 20) // remove last in it is a * diff --git a/src/util/result2stats.cpp b/src/util/result2stats.cpp index 3c8bc9c..6bf7393 100644 --- a/src/util/result2stats.cpp +++ b/src/util/result2stats.cpp @@ -1,6 +1,5 @@ #include "result2stats.h" -#include "Alignment.h" #include "AminoAcidLookupTables.h" #include "Debug.h" @@ -9,8 +8,6 @@ #include "itoa.h" #include "Parameters.h" -#include - #ifdef OPENMP #include #endif @@ -233,15 +230,33 @@ int StatsComputer::sumValue() { return 0; } +float averageValueOnAminoAcids(const std::unordered_map &values, const char *seq) { + const char *seqPointer = seq; + float ret = values.at('0') + values.at('1'); // C ter and N ter values + std::unordered_map::const_iterator k; + + while (*seqPointer != '\0' && *seqPointer != '\n') { + if ((k = values.find(tolower(*seqPointer))) != values.end()) { + ret += k->second; + } + + seqPointer++; + } + + size_t seqLen = seqPointer - seq; + return ret / std::max(static_cast(1), seqLen); +} + + float doolittle(const char *seq) { static Doolittle doolitle; - return Util::averageValueOnAminoAcids(doolitle.values, seq); + return averageValueOnAminoAcids(doolitle.values, seq); } float charges(const char *seq) { static Charges charges; - return Util::averageValueOnAminoAcids(charges.values, seq); + return averageValueOnAminoAcids(charges.values, seq); } std::string firstline(const char *seq) { diff --git a/src/util/splitsequence.cpp b/src/util/splitsequence.cpp index 18a5a63..a1c7b2c 100644 --- a/src/util/splitsequence.cpp +++ b/src/util/splitsequence.cpp @@ -42,6 +42,11 @@ int splitsequence(int argc, const char **argv, const Command& command) { DBReader headerReader(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); headerReader.open(DBReader::NOSORT); + if(par.sequenceSplitMode == Parameters::SEQUENCE_SPLIT_MODE_SOFT && par.compressed == true) { + Debug(Debug::WARNING) << "Sequence split mode (--sequence-split-mode 0) and compressed (--compressed 1) can not be combined.\nTurn compressed to 0"; + par.compressed = 0; + } + DBWriter sequenceWriter(par.db2.c_str(), par.db2Index.c_str(), par.threads, par.compressed, reader.getDbtype()); sequenceWriter.open(); diff --git a/src/util/subtractdbs.cpp b/src/util/subtractdbs.cpp index a4ed63a..a1cd98b 100644 --- a/src/util/subtractdbs.cpp +++ b/src/util/subtractdbs.cpp @@ -12,6 +12,8 @@ #include "Util.h" #include "Parameters.h" +#include + #ifdef OPENMP #include #endif diff --git a/src/util/summarizetabs.cpp b/src/util/summarizetabs.cpp index 1d69df4..21d34e3 100644 --- a/src/util/summarizetabs.cpp +++ b/src/util/summarizetabs.cpp @@ -9,6 +9,7 @@ #include #include +#include #ifdef OPENMP #include diff --git a/src/workflow/EasyCluster.cpp b/src/workflow/EasyCluster.cpp index dd73f63..5883f1d 100644 --- a/src/workflow/EasyCluster.cpp +++ b/src/workflow/EasyCluster.cpp @@ -14,6 +14,7 @@ void setEasyClusterDefaults(Parameters *p) { p->removeTmpFiles = true; p->covThr = 0.8; p->evalThr = 0.001; + p->createdbMode = Parameters::SEQUENCE_SPLIT_MODE_SOFT; p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; p->maxResListLen = 20; } diff --git a/src/workflow/EasyLinclust.cpp b/src/workflow/EasyLinclust.cpp index eb831f8..9571f5c 100644 --- a/src/workflow/EasyLinclust.cpp +++ b/src/workflow/EasyLinclust.cpp @@ -15,6 +15,7 @@ void setEasyLinclustDefaults(Parameters *p) { p->removeTmpFiles = true; p->covThr = 0.8; p->evalThr = 0.001; + p->createdbMode = 
Parameters::SEQUENCE_SPLIT_MODE_SOFT; //p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV_SEQID; p->orfStartMode = 1; p->orfMinLength = 10; diff --git a/src/workflow/EasySearch.cpp b/src/workflow/EasySearch.cpp index 52d26dd..b36437f 100644 --- a/src/workflow/EasySearch.cpp +++ b/src/workflow/EasySearch.cpp @@ -18,7 +18,7 @@ void setEasySearchDefaults(Parameters *p, bool linsearch) { } void setEasySearchMustPassAlong(Parameters *p, bool linsearch) { if (linsearch) { - p->PARAM_DONT_SHUFFLE.wasSet = true; + p->PARAM_SHUFFLE.wasSet = true; } p->PARAM_S.wasSet = true; p->PARAM_REMOVE_TMP_FILES.wasSet = true; @@ -53,13 +53,18 @@ int doeasysearch(int argc, const char **argv, const Command &command, bool linse setEasySearchMustPassAlong(&par, linsearch); bool needBacktrace = false; + bool needTaxonomy = false; + bool needTaxonomyMapping = false; + { bool needSequenceDB = false; bool needFullHeaders = false; bool needLookup = false; bool needSource = false; - Parameters::getOutputFormat(par.outfmt, needSequenceDB, needBacktrace, needFullHeaders, needLookup, needSource); + Parameters::getOutputFormat(par.outfmt, needSequenceDB, needBacktrace, needFullHeaders, + needLookup, needSource, needTaxonomyMapping, needTaxonomy); } + if (par.formatAlignmentMode == Parameters::FORMAT_ALIGNMENT_SAM || par.greedyBestHits) { needBacktrace = true; } @@ -85,6 +90,10 @@ int doeasysearch(int argc, const char **argv, const Command &command, bool linse cmd.addVariable("TARGET", target.c_str()); par.filenames.pop_back(); + if(needTaxonomy || needTaxonomyMapping){ + Parameters::checkIfTaxDbIsComplete(target); + } + if (linsearch) { const bool isIndex = LinsearchIndexReader::searchForIndex(target).empty() == false; cmd.addVariable("INDEXEXT", isIndex ? ".linidx" : NULL); @@ -107,6 +116,8 @@ int doeasysearch(int argc, const char **argv, const Command &command, bool linse cmd.addVariable("RUNNER", par.runner.c_str()); + cmd.addVariable("CREATEDB_QUERY_PAR", par.createParameterString(par.createdb).c_str()); + par.createdbMode = Parameters::SEQUENCE_SPLIT_MODE_HARD; cmd.addVariable("CREATEDB_PAR", par.createParameterString(par.createdb).c_str()); cmd.addVariable("CONVERT_PAR", par.createParameterString(par.convertalignments).c_str()); cmd.addVariable("SUMMARIZE_PAR", par.createParameterString(par.summarizeresult).c_str()); diff --git a/src/workflow/EasyTaxonomy.cpp b/src/workflow/EasyTaxonomy.cpp index ba30cfa..d4e924e 100644 --- a/src/workflow/EasyTaxonomy.cpp +++ b/src/workflow/EasyTaxonomy.cpp @@ -9,6 +9,7 @@ void setEasyTaxonomyDefaults(Parameters *p) { p->spacedKmer = true; p->removeTmpFiles = true; p->alignmentMode = Parameters::ALIGNMENT_MODE_SCORE_COV; + p->createdbMode = Parameters::SEQUENCE_SPLIT_MODE_SOFT; p->sensitivity = 5.7; p->evalThr = 1; p->orfStartMode = 1; @@ -59,9 +60,12 @@ int easytaxonomy(int argc, const char **argv, const Command& command) { par.PARAM_TAX_OUTPUT_MODE.wasSet = true; cmd.addVariable("TAXONOMY_PAR", par.createParameterString(par.taxonomy, true).c_str()); par.alignmentMode = alignmentMode; + + cmd.addVariable("CREATEDB_QUERY_PAR", par.createParameterString(par.createdb).c_str()); cmd.addVariable("LCA_PAR", par.createParameterString(par.lca).c_str()); cmd.addVariable("CONVERT_PAR", par.createParameterString(par.convertalignments).c_str()); - cmd.addVariable("THREADS_PAR", par.createParameterString(par.threadsandcompression).c_str()); + cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str()); + cmd.addVariable("THREADS_PAR", 
par.createParameterString(par.onlythreads).c_str());
     cmd.addVariable("CREATETSV_PAR", par.createParameterString(par.createtsv).c_str());
     par.evalThr = 100000000;
     cmd.addVariable("SWAPRESULT_PAR", par.createParameterString(par.swapresult).c_str());
diff --git a/src/workflow/Rbh.cpp b/src/workflow/Rbh.cpp
index 88bface..277cffc 100644
--- a/src/workflow/Rbh.cpp
+++ b/src/workflow/Rbh.cpp
@@ -59,7 +59,7 @@ int rbh(int argc, const char **argv, const Command &command) {
     cmd.addVariable("VERB_COMP_PAR", par.createParameterString(par.verbandcompression).c_str());
     cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str());
-    std::string program = par.db4 + "/rbh.sh";
+    std::string program = tmpDir + "/rbh.sh";
     FileUtil::writeFile(program, rbh_sh, rbh_sh_len);
     cmd.execProgram(program.c_str(), par.filenames);
diff --git a/util/build_osx.sh b/util/build_osx.sh
index 57e9409..f53d813 100755
--- a/util/build_osx.sh
+++ b/util/build_osx.sh
@@ -1,6 +1,20 @@
-#!/bin/bash -e
-REPO="$(greadlink -f $1)"
-BUILD="$(greadlink -f $2)"
+#!/bin/sh -e
+abspath() {
+    if [ -d "$1" ]; then
+        (cd "$1"; pwd)
+    elif [ -f "$1" ]; then
+        if [ -z "${1##*/*}" ]; then
+            echo "$(cd "${1%/*}"; pwd)/${1##*/}"
+        else
+            echo "$(pwd)/$1"
+        fi
+    elif [ -d "$(dirname "$1")" ]; then
+        echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")"
+    fi
+}
+
+REPO="$(abspath "$1")"
+BUILD="$(abspath "$2")"
 BINARY_NAME="${3:-mmseqs}"
 if [ ! -d "$REPO" ]; then
@@ -9,19 +23,19 @@ if [ ! -d "$REPO" ]; then
 fi
 mkdir -p "$BUILD/build_sse41" && cd "$BUILD/build_sse41"
-cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=0 -DHAVE_MPI=0 -DHAVE_SSE4_1=1 -DBUILD_SHARED_LIBS=OFF -DCMAKE_EXE_LINKER_FLAGS="-static-libgcc -static-libstdc++" -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" "$REPO"
+cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=0 -DHAVE_MPI=0 -DHAVE_SSE4_1=1 -DBUILD_SHARED_LIBS=OFF -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" -DOpenMP_C_FLAGS="-Xpreprocessor -fopenmp -I/usr/local/opt/libomp/include" -DOpenMP_C_LIB_NAMES=omp -DOpenMP_CXX_FLAGS="-Xpreprocessor -fopenmp -I/usr/local/opt/libomp/include" -DOpenMP_CXX_LIB_NAMES=omp -DOpenMP_omp_LIBRARY=/usr/local/opt/libomp/lib/libomp.a "$REPO"
 make -j 4
-if [ "$(echo $(otool -L "src/${BINARY_NAME}" | wc -l))" != 4 ]; then
+if [ "$(echo $(otool -L "src/${BINARY_NAME}" | wc -l))" != 5 ]; then
     echo "Too many linked libraries found in ${BINARY_NAME} binary. Build is not static!"
     exit 1
 fi
 cd "$BUILD" && mkdir -p "build_avx2" && cd "build_avx2"
-cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=0 -DHAVE_MPI=0 -DHAVE_AVX2=1 -DBUILD_SHARED_LIBS=OFF -DCMAKE_EXE_LINKER_FLAGS="-static-libgcc -static-libstdc++" -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" "$REPO"
+cmake -DCMAKE_BUILD_TYPE=Release -DHAVE_TESTS=0 -DHAVE_MPI=0 -DHAVE_AVX2=1 -DBUILD_SHARED_LIBS=OFF -DCMAKE_FIND_LIBRARY_SUFFIXES=".a" -DOpenMP_C_FLAGS="-Xpreprocessor -fopenmp -I/usr/local/opt/libomp/include" -DOpenMP_C_LIB_NAMES=omp -DOpenMP_CXX_FLAGS="-Xpreprocessor -fopenmp -I/usr/local/opt/libomp/include" -DOpenMP_CXX_LIB_NAMES=omp -DOpenMP_omp_LIBRARY=/usr/local/opt/libomp/lib/libomp.a "$REPO"
 make -j 4
-if [ "$(echo $(otool -L "src/${BINARY_NAME}" | wc -l))" != 4 ]; then
+if [ "$(echo $(otool -L "src/${BINARY_NAME}" | wc -l))" != 5 ]; then
     echo "Too many linked libraries found in ${BINARY_NAME} binary. Build is not static!"
     exit 1
 fi
diff --git a/util/regression b/util/regression
index c5df561..fd14d65 160000
--- a/util/regression
+++ b/util/regression
@@ -1 +1 @@
-Subproject commit c5df561214eccc338d7ec5bb3f80434c03ef1691
+Subproject commit fd14d653c296d5f99007653bc139a4486184d7ba
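
Editor's note on the new --createdb-mode 0 path in createdb.cpp above: in the soft mode the writer emits only index entries (writeIndexEntry) whose offsets point back into the original FASTA file, and the input is soft-linked as the database data file via FileUtil::symlinkAbs instead of being copied. The stand-alone sketch below only illustrates that offset-indexing idea; it is not MMseqs2 code, it assumes single-line FASTA records with Unix line endings, and every name in it is illustrative.

```cpp
// Minimal sketch (not MMseqs2 code): index a single-line FASTA by byte offsets
// instead of copying the data, mimicking the idea behind --createdb-mode 0.
#include <cstdio>
#include <fstream>
#include <iostream>
#include <string>

int main(int argc, char **argv) {
    if (argc < 2) { std::cerr << "usage: " << argv[0] << " db.fasta\n"; return 1; }
    std::ifstream in(argv[1], std::ios::binary);
    std::string line;
    size_t offset = 0;                 // byte offset of the current line in the FASTA
    size_t id = 0;                     // running entry id
    size_t hdrOffset = 0, hdrLen = 0;  // location of the most recent header line
    while (std::getline(in, line)) {
        size_t lineLen = line.size() + 1;  // +1 for the '\n' stripped by getline
        if (!line.empty() && line[0] == '>') {
            hdrOffset = offset;            // header starts here
            hdrLen = lineLen;
        } else if (!line.empty()) {
            // a sequence line directly follows its header (single-line FASTA only);
            // emit (id, offset, length) index records instead of rewriting the data
            std::printf("hdr\t%zu\t%zu\t%zu\n", id, hdrOffset, hdrLen);
            std::printf("seq\t%zu\t%zu\t%zu\n", id, offset, lineLen);
            ++id;
        }
        offset += lineLen;
    }
    return 0;
}
```

Seen this way, it is also clear why the patch falls back to --createdb-mode 1 and disables --shuffle and --compressed: shuffled, compressed, or multiline-FASTA data can no longer be addressed by plain byte offsets into the original file.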
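
Several hunks in offsetalignment.cpp above convert a protein-level backtrace to nucleotide space with Matcher::result_t::protein2nucl (note the reserve(res.backtrace.length() * 3)). The routine itself is not part of this patch; the sketch below only illustrates the underlying one-codon-per-residue expansion, assuming a plain M/I/D backtrace string, and is not the MMseqs2 implementation.

```cpp
// Sketch: expand a protein alignment backtrace (one M/I/D per residue) to
// nucleotide space by repeating every operation three times, e.g. "MMID" -> "MMMMMMIIIDDD".
#include <iostream>
#include <string>

static std::string proteinToNuclBacktrace(const std::string &bt) {
    std::string out;
    out.reserve(bt.size() * 3);  // one codon (3 nt) per protein alignment position
    for (char op : bt) {
        out.append(3, op);
    }
    return out;
}

int main() {
    std::cout << proteinToNuclBacktrace("MMID") << "\n";  // prints MMMMMMIIIDDD
    return 0;
}
```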