diff --git a/src/commons/KmerExtractor.cpp b/src/commons/KmerExtractor.cpp index ae6a6186..d0f88363 100644 --- a/src/commons/KmerExtractor.cpp +++ b/src/commons/KmerExtractor.cpp @@ -1,4 +1,6 @@ #include "KmerExtractor.h" +#include "common.h" +#include KmerExtractor::KmerExtractor(const LocalParameters &par) { spaceNum = par.spaceMask.length() - 8; @@ -76,7 +78,7 @@ void KmerExtractor::fillQueryKmerBufferParallel(KSeqWrapper *kseq1, processedQueryNum ++; count ++; } -#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, currentSplit, count, reads1) +#pragma omp parallel default(none), shared(par, kmerBuffer, cout, processedQueryNum, queryList, currentQueryNum, count, reads1) { SeqIterator seqIterator(par); size_t posToWrite; @@ -165,8 +167,8 @@ void KmerExtractor::fillQueryKmerBufferParallel_paired(KSeqWrapper *kseq1, for (size_t i = 0; i < currentQueryNum; i ++) { size_t queryIdx = processedQueryNum - currentQueryNum + i; // Get k-mer count - auto kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); - auto kmerCnt2 = LocalUtil::getQueryKmerNumber(reads2[i].length(), spaceNum); + int kmerCnt = LocalUtil::getQueryKmerNumber(reads1[i].length(), spaceNum); + int kmerCnt2 = LocalUtil::getQueryKmerNumber(reads2[i].length(), spaceNum); // Ignore short read if (kmerCnt2 < 1 || kmerCnt < 1) { continue; } diff --git a/src/commons/KmerExtractor.h b/src/commons/KmerExtractor.h index 14226262..bac04a27 100644 --- a/src/commons/KmerExtractor.h +++ b/src/commons/KmerExtractor.h @@ -3,6 +3,8 @@ #include "SeqIterator.h" #include "QueryIndexer.h" #include "KSeqWrapper.h" +#include "common.h" +#include class KmerExtractor { private: diff --git a/src/commons/LocalUtil.cpp b/src/commons/LocalUtil.cpp index 6b87319b..ff80fc47 100644 --- a/src/commons/LocalUtil.cpp +++ b/src/commons/LocalUtil.cpp @@ -33,15 +33,7 @@ void LocalUtil::splitQueryFile(std::vector & sequences, const std delete kseq; } -int LocalUtil::getMaxCoveredLength(int queryLength) { - if (queryLength % 3 == 2) { - return queryLength - 2; - } else if (queryLength % 3 == 1) { - return queryLength - 4; - } else { - return queryLength - 3; - } -} + int LocalUtil::getFirstWhiteSpacePos(const std::string &str) { for (size_t i = 0; i < str.size(); ++i) { diff --git a/src/commons/LocalUtil.h b/src/commons/LocalUtil.h index 7f511239..7204125e 100644 --- a/src/commons/LocalUtil.h +++ b/src/commons/LocalUtil.h @@ -17,7 +17,8 @@ class LocalUtil : public Util { static void splitQueryFile(std::vector & seqSegments, const std::string & queryPath); - static int getMaxCoveredLength(int queryLength) ; + template + static T getMaxCoveredLength(T queryLength); static int getFirstWhiteSpacePos(const std::string & str); @@ -30,5 +31,15 @@ T LocalUtil::getQueryKmerNumber(T queryLength, int spaceNum) { return (getMaxCoveredLength(queryLength) / 3 - kmerLength - spaceNum + 1) * 6; } +template +T LocalUtil::getMaxCoveredLength(T queryLength) { + if (queryLength % 3 == 2) { + return queryLength - 2; + } else if (queryLength % 3 == 1) { + return queryLength - 4; + } else { + return queryLength - 3; + } +} #endif //METABULI_LOCALUTIL_H diff --git a/src/commons/QueryFilter.cpp b/src/commons/QueryFilter.cpp index 60caf658..303b3331 100644 --- a/src/commons/QueryFilter.cpp +++ b/src/commons/QueryFilter.cpp @@ -1,5 +1,6 @@ #include "QueryFilter.h" #include "common.h" +#include QueryFilter::QueryFilter(LocalParameters & par) { // Load parameters diff --git a/src/commons/QueryIndexer.cpp b/src/commons/QueryIndexer.cpp index def385d8..5c3401ed 100644 --- a/src/commons/QueryIndexer.cpp +++ b/src/commons/QueryIndexer.cpp @@ -47,12 +47,15 @@ void QueryIndexer::indexQueryFile(size_t processedQueryNum) { readNum_1++; seqCnt++; totalReadLength += kseq->entry.sequence.l; - size_t currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq->entry.sequence.l, spaceNum); - kmerCnt += currentKmerCnt; - + int kmerCnt_int = LocalUtil::getQueryKmerNumber(kseq->entry.sequence.l, spaceNum); + if (kmerCnt_int > 0) { + kmerCnt += (size_t) kmerCnt_int; + } else { + continue; + } if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt) > availableRam) { - querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt, seqCnt - 1); - kmerCnt = currentKmerCnt; + querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - (size_t) kmerCnt_int, seqCnt - 1); + kmerCnt = (size_t) kmerCnt_int; start = readNum_1 - 1; seqCnt = 1; } @@ -77,14 +80,15 @@ void QueryIndexer::indexQueryFile(size_t processedQueryNum) { size_t seqCnt_1 = 0; size_t seqCnt_2 = 0; size_t start = 0; - size_t currentKmerCnt; bool end = false; + int kmerCnt_int_1; + int kmerCnt_int_2; while(true) { if (kseq_1->ReadEntry()) { readNum_1++; seqCnt_1++; totalReadLength += kseq_1->entry.sequence.l; - currentKmerCnt = LocalUtil::getQueryKmerNumber(kseq_1->entry.sequence.l, spaceNum); + kmerCnt_int_1 = LocalUtil::getQueryKmerNumber(kseq_1->entry.sequence.l, spaceNum); } else { end = true; } @@ -93,8 +97,7 @@ void QueryIndexer::indexQueryFile(size_t processedQueryNum) { readNum_2++; seqCnt_2++; totalReadLength += kseq_2->entry.sequence.l; - currentKmerCnt += LocalUtil::getQueryKmerNumber(kseq_2->entry.sequence.l, spaceNum); - + kmerCnt_int_2 = LocalUtil::getQueryKmerNumber(kseq_1->entry.sequence.l, spaceNum); } else { end = true; } @@ -104,10 +107,15 @@ void QueryIndexer::indexQueryFile(size_t processedQueryNum) { EXIT(EXIT_FAILURE); } - kmerCnt += currentKmerCnt; + if (kmerCnt_int_1 > 0 && kmerCnt_int_2 > 0) { + kmerCnt += (size_t) kmerCnt_int_1 + (size_t) kmerCnt_int_2; + } else { + continue; + } + if (bytesPerKmer * kmerCnt + ((size_t) 200 * seqCnt_1) > availableRam) { - querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - currentKmerCnt, seqCnt_1 - 1); - kmerCnt = currentKmerCnt; + querySplits.emplace_back(start, readNum_1 - 1, kmerCnt - ((size_t) kmerCnt_int_1 + (size_t) kmerCnt_int_2), seqCnt_1 - 1); + kmerCnt = (size_t) kmerCnt_int_1 + (size_t) kmerCnt_int_2; start = readNum_1 - 1; seqCnt_1 = 1; } diff --git a/src/commons/QueryIndexer.h b/src/commons/QueryIndexer.h index d0dbf7f2..103ec4ac 100644 --- a/src/commons/QueryIndexer.h +++ b/src/commons/QueryIndexer.h @@ -7,6 +7,8 @@ #include "KSeqWrapper.h" #include "LocalUtil.h" #include "Debug.h" +#include "common.h" +#include struct QuerySplit { size_t start; diff --git a/src/commons/SeqIterator.cpp b/src/commons/SeqIterator.cpp index 6f93eaa1..b789e281 100644 --- a/src/commons/SeqIterator.cpp +++ b/src/commons/SeqIterator.cpp @@ -318,7 +318,6 @@ void SeqIterator::fillQueryKmerBuffer(const char *seq, int seqLen, QueryKmerBuff int forOrRev; uint64_t tempKmer = 0; int checkN; - for (uint8_t frame = 0; frame < 6; frame++) { uint32_t len = aaFrames[frame].size(); forOrRev = frame / 3; @@ -355,7 +354,6 @@ void SeqIterator::fillQueryKmerBuffer(const char *seq, int seqLen, QueryKmerBuff posToWrite++; } } - // cout << "posToWrite: " << posToWrite << endl; } void diff --git a/src/commons/common.h b/src/commons/common.h index 0ab08446..dc847fcb 100644 --- a/src/commons/common.h +++ b/src/commons/common.h @@ -1,5 +1,6 @@ #ifndef ADCLASSIFIER2_COMMON_H #define ADCLASSIFIER2_COMMON_H +#include #include #include "LocalParameters.h" #include "NcbiTaxonomy.h" @@ -9,6 +10,13 @@ #define unlikely(x) __builtin_expect((x),0) #define kmerLength 8 +struct KmerCnt { + KmerCnt(size_t length, size_t kmerCnt, size_t totalCnt) : length(length), kmerCnt(kmerCnt), totalCnt(totalCnt) {} + KmerCnt() : length(0), kmerCnt(0), totalCnt(0){} + size_t length; + size_t kmerCnt; + size_t totalCnt; +}; struct SequenceBlock{ SequenceBlock(size_t start, size_t end, size_t length, size_t seqLength = 0) : start(start), end(end), length(length), seqLength(seqLength) {}