From 5645e86ed854f5de327b543ed57d0c2ef1abfee6 Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Tue, 2 Apr 2024 22:27:30 +0900 Subject: [PATCH 1/3] Squashed 'lib/mmseqs/' changes from 950342d9c6..804bb2af6d 804bb2af6d Fix --taxon-list being broken multi-threaded in prefilter and ungappedprefilter f6c98807d5 Only fetch cSeqId if its needed in expand cb714dd490 Add --db-load-mode to mergeresultsbyset abab073141 Fix test 036828397e Fix unsigned/signed warning in local comp-bias computation 9b1a04851f Fix missing newline in Profile print debug function git-subtree-dir: lib/mmseqs git-subtree-split: 804bb2af6d1be4086252c46bf15f3c75a5d9e931 --- src/MMseqsBase.cpp | 2 +- src/commons/Parameters.cpp | 5 +++ src/commons/Parameters.h | 1 + src/commons/Sequence.cpp | 2 ++ src/commons/SubstitutionMatrix.cpp | 23 +++++++------ src/commons/SubstitutionMatrix.h | 2 +- src/prefiltering/Prefiltering.cpp | 2 +- src/prefiltering/QueryMatcherTaxonomyHook.h | 27 +++++++++++---- src/prefiltering/ungappedprefilter.cpp | 6 ++-- src/test/TestDiagonalScoring.cpp | 37 +++++++++++++-------- src/test/TestDiagonalScoringPerformance.cpp | 14 ++++---- src/util/expandaln.cpp | 2 +- 12 files changed, 78 insertions(+), 45 deletions(-) diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 02912545..282e740d 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -562,7 +562,7 @@ std::vector baseCommands = { {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"pvalDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}}, - {"mergeresultsbyset", mergeresultsbyset, &par.threadsandcompression,COMMAND_MULTIHIT, + {"mergeresultsbyset", mergeresultsbyset, &par.mergeresultsbyset, COMMAND_MULTIHIT, "Merge results from multiple ORFs back to their respective contig", NULL, "Ruoshi Zhang, Clovis Norroy & Milot Mirdita ", diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 41fad178..c9d0a169 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -892,6 +892,11 @@ Parameters::Parameters(): combinepvalbyset.push_back(&PARAM_COMPRESSED); combinepvalbyset.push_back(&PARAM_V); + // mergeresultsbyset + mergeresultsbyset.push_back(&PARAM_PRELOAD_MODE); + mergeresultsbyset.push_back(&PARAM_THREADS); + mergeresultsbyset.push_back(&PARAM_COMPRESSED); + mergeresultsbyset.push_back(&PARAM_V); // offsetalignment offsetalignment.push_back(&PARAM_CHAIN_ALIGNMENT); diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 6db7ddbf..745d2f80 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -1166,6 +1166,7 @@ class Parameters { std::vector profile2seq; std::vector besthitbyset; std::vector combinepvalbyset; + std::vector mergeresultsbyset; std::vector multihitdb; std::vector multihitsearch; std::vector expandaln; diff --git a/src/commons/Sequence.cpp b/src/commons/Sequence.cpp index 8bbdbb9a..6461efcd 100644 --- a/src/commons/Sequence.cpp +++ b/src/commons/Sequence.cpp @@ -346,6 +346,8 @@ void Sequence::printProfile() const { } #ifdef GAP_POS_SCORING printf("%3d %3d %3d\n", gDel[i] & 0xF, gDel[i] >> 4, gIns[i]); +else + printf("\n"); #endif } } diff --git a/src/commons/SubstitutionMatrix.cpp b/src/commons/SubstitutionMatrix.cpp index c9d31df6..f15bc998 100644 --- a/src/commons/SubstitutionMatrix.cpp +++ b/src/commons/SubstitutionMatrix.cpp @@ -166,7 +166,7 @@ void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrection(short *profileS } void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrectionAln(int8_t *profileScores, - int N, size_t alphabetSize, BaseMatrix *subMat) { + unsigned int N, size_t alphabetSize, BaseMatrix *subMat) { const int windowSize = 40; @@ -176,32 +176,33 @@ void SubstitutionMatrix::calcProfileProfileLocalAaBiasCorrectionAln(int8_t *prof ProfileStates ps(alphabetSize,subMat->pBack); - for (int pos = 0; pos < N; pos++) { + for (unsigned int pos = 0; pos < N; pos++) { for(size_t aa = 0; aa < alphabetSize; aa++) { pnul[pos] += *(profileScores + pos + N*aa) * ps.prior[aa]; } } - for (int i = 0; i < N; i++){ - const int minPos = std::max(0, (i - windowSize/2)); - const int maxPos = std::min(N, (i + windowSize/2)); + for (unsigned int i = 0; i < N; i++){ + const int minPos = std::max(0, ((int)i - windowSize/2)); + const unsigned int maxPos = std::min(N, (i + windowSize/2)); const int windowLength = maxPos - minPos; // negative score for the amino acids in the neighborhood of i memset(aaSum, 0, sizeof(float) * alphabetSize); - for (int j = minPos; j < maxPos; j++){ - if( i == j ) + for (unsigned int j = minPos; j < maxPos; j++){ + if (i == j) { continue; - for(size_t aa = 0; aa < alphabetSize; aa++){ + } + for (size_t aa = 0; aa < alphabetSize; aa++) { aaSum[aa] += *(profileScores + aa*N + j) - pnul[j]; } } - for(size_t aa = 0; aa < alphabetSize; aa++) { + for (size_t aa = 0; aa < alphabetSize; aa++) { profileScores[i + aa*N] = static_cast(*(profileScores + i + N*aa) - aaSum[aa]/windowLength); } } - delete [] aaSum; - delete [] pnul; + delete[] aaSum; + delete[] pnul; } diff --git a/src/commons/SubstitutionMatrix.h b/src/commons/SubstitutionMatrix.h index cb705a6d..16c66d96 100644 --- a/src/commons/SubstitutionMatrix.h +++ b/src/commons/SubstitutionMatrix.h @@ -28,7 +28,7 @@ class SubstitutionMatrix : public BaseMatrix { const int N, size_t alphabetSize); static void calcProfileProfileLocalAaBiasCorrectionAln(int8_t *profileScores, - int N, + unsigned int N, size_t alphabetSize, BaseMatrix *subMat); static void calcGlobalAaBiasCorrection(const BaseMatrix * m, diff --git a/src/prefiltering/Prefiltering.cpp b/src/prefiltering/Prefiltering.cpp index 9efabdb3..14e8992a 100644 --- a/src/prefiltering/Prefiltering.cpp +++ b/src/prefiltering/Prefiltering.cpp @@ -226,7 +226,7 @@ Prefiltering::Prefiltering(const std::string &queryDB, if (par.taxonList.length() > 0) { - taxonomyHook = new QueryMatcherTaxonomyHook(targetDB, tdbr, par.taxonList); + taxonomyHook = new QueryMatcherTaxonomyHook(targetDB, tdbr, par.taxonList, par.threads); } else { taxonomyHook = NULL; } diff --git a/src/prefiltering/QueryMatcherTaxonomyHook.h b/src/prefiltering/QueryMatcherTaxonomyHook.h index c8a86b00..de32c332 100644 --- a/src/prefiltering/QueryMatcherTaxonomyHook.h +++ b/src/prefiltering/QueryMatcherTaxonomyHook.h @@ -7,20 +7,30 @@ #include "DBReader.h" #include "TaxonomyExpression.h" +#ifdef OPENMP +#include +#endif + class QueryMatcherTaxonomyHook : public QueryMatcherHook { public: - QueryMatcherTaxonomyHook(std::string targetPath, DBReader* targetReader, const std::string& expressionString) - : targetReader(targetReader), dbFrom(0) { + QueryMatcherTaxonomyHook(std::string targetPath, DBReader* targetReader, const std::string& expressionString, unsigned int threads) + : targetReader(targetReader), dbFrom(0), threads(threads) { std::string targetName = dbPathWithoutIndex(targetPath); taxonomy = NcbiTaxonomy::openTaxonomy(targetName); taxonomyMapping = new MappingReader(targetName); - expression = new TaxonomyExpression(expressionString, *taxonomy); + expression = new TaxonomyExpression*[threads]; + for (unsigned int i = 0; i < threads; i++) { + expression[i] = new TaxonomyExpression(expressionString, *taxonomy); + } } ~QueryMatcherTaxonomyHook() { delete taxonomy; delete taxonomyMapping; - delete expression; + for (unsigned int i = 0; i < threads; i++) { + delete expression[i]; + } + delete[] expression; } void setDbFrom(unsigned int from) { @@ -28,12 +38,16 @@ class QueryMatcherTaxonomyHook : public QueryMatcherHook { } size_t afterDiagonalMatchingHook(QueryMatcher& matcher, size_t resultSize) { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = (unsigned int) omp_get_thread_num(); +#endif size_t writePos = 0; for (size_t i = 0; i < resultSize; i++) { unsigned int currId = matcher.foundDiagonals[i].id; unsigned int key = targetReader->getDbKey(dbFrom + currId); TaxID currTax = taxonomyMapping->lookup(key); - if (expression->isAncestor(currTax)) { + if (expression[thread_idx]->isAncestor(currTax)) { if (i != writePos) { matcher.foundDiagonals[writePos] = matcher.foundDiagonals[i]; } @@ -63,9 +77,10 @@ class QueryMatcherTaxonomyHook : public QueryMatcherHook { NcbiTaxonomy* taxonomy; MappingReader* taxonomyMapping; DBReader* targetReader; - TaxonomyExpression* expression; + TaxonomyExpression** expression; unsigned int dbFrom; + unsigned int threads; }; #endif \ No newline at end of file diff --git a/src/prefiltering/ungappedprefilter.cpp b/src/prefiltering/ungappedprefilter.cpp index 09e2f516..84bc0a96 100644 --- a/src/prefiltering/ungappedprefilter.cpp +++ b/src/prefiltering/ungappedprefilter.cpp @@ -77,8 +77,8 @@ int prefilterInternal(int argc, const char **argv, const Command &command, int m QueryMatcherTaxonomyHook * taxonomyHook = NULL; - if(par.PARAM_TAXON_LIST.wasSet){ - taxonomyHook = new QueryMatcherTaxonomyHook(par.db2, tdbr, par.taxonList); + if (par.PARAM_TAXON_LIST.wasSet) { + taxonomyHook = new QueryMatcherTaxonomyHook(par.db2, tdbr, par.taxonList, par.threads); } Debug::Progress progress(qdbr->getSize()); @@ -121,7 +121,7 @@ int prefilterInternal(int argc, const char **argv, const Command &command, int m unsigned int targetKey = tdbr->getDbKey(tId); if(taxonomyHook != NULL){ TaxID currTax = taxonomyHook->taxonomyMapping->lookup(targetKey); - if (taxonomyHook->expression->isAncestor(currTax) == false) { + if (taxonomyHook->expression[thread_idx]->isAncestor(currTax) == false) { continue; } } diff --git a/src/test/TestDiagonalScoring.cpp b/src/test/TestDiagonalScoring.cpp index d0eef85b..16407405 100644 --- a/src/test/TestDiagonalScoring.cpp +++ b/src/test/TestDiagonalScoring.cpp @@ -107,11 +107,12 @@ int main (int, const char**) { // std::cout << compositionBias[i] << " "; // } // std::cout << std::endl; +// matcher.createProfile(&s5, compositionBias); // for(size_t i = 0; i < s6.L; i++){ // hits[0].id = s6.getId(); // hits[0].diagonal = 0; // hits[0].count = 0; -// matcher.processQuery(&s5, compositionBias, hits, 1, 0); +// matcher.align(hits, 1, 0); // std::cout << hits[0].diagonal << " " << (int)hits[0].count << std::endl; // } @@ -121,75 +122,83 @@ int main (int, const char**) { hits[0].id = s1.getId(); hits[0].diagonal = 0; - matcher.processQuery(&s1, compositionBias, hits, 1); + matcher.createProfile(&s1, compositionBias); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; for(int i = 0; i < 16; i++){ hits[i].id = s1.getId(); hits[i].diagonal = 0; } - matcher.processQuery(&s1, compositionBias, hits, 16); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; + matcher.createProfile(&s2, compositionBias); hits[0].id = s1.getId(); hits[0].diagonal = 9; - matcher.processQuery(&s2, compositionBias, hits, 1); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; for(int i = 0; i < 16; i++){ hits[i].id = s1.getId(); hits[i].diagonal = 9; } - matcher.processQuery(&s2, compositionBias, hits, 16); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; for(int i = 0; i < 16; i++){ hits[i].id = s2.getId(); hits[i].diagonal = -9; } - matcher.processQuery(&s1, compositionBias, hits, 16); + matcher.createProfile(&s1, compositionBias); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; - matcher.processQuery(&s1, compositionBias, hits, 1); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; for(int i = 0; i < 16; i++){ hits[i].id = s2.getId(); hits[i].diagonal = -9; } - matcher.processQuery(&s3, compositionBias, hits, 16); + matcher.createProfile(&s3, compositionBias); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; - matcher.processQuery(&s3, compositionBias, hits, 1); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; hits[0].id = s4.getId(); hits[0].diagonal = -256; - matcher.processQuery(&s1, compositionBias, hits, 1); + matcher.createProfile(&s1, compositionBias); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; hits[0].id = s1.getId(); hits[0].diagonal = 256; - matcher.processQuery(&s4,compositionBias, hits, 1); + matcher.createProfile(&s4, compositionBias); + matcher.align(hits, 1); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; hits[0].id = s7.getId(); hits[0].diagonal = -512; - matcher.processQuery(&s1,compositionBias, hits, 16); + matcher.createProfile(&s1, compositionBias); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; hits[0].id = s1.getId(); hits[0].diagonal = 512; - matcher.processQuery(&s7,compositionBias, hits, 16); + matcher.createProfile(&s7, compositionBias); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; hits[0].id = s7.getId(); hits[0].diagonal = 0; - matcher.processQuery(&s7, compositionBias, hits, 16); + matcher.align(hits, 16); std::cout << ExtendedSubstitutionMatrix::calcScore(s1.numSequence, s1.numSequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].count << std::endl; delete [] compositionBias; diff --git a/src/test/TestDiagonalScoringPerformance.cpp b/src/test/TestDiagonalScoringPerformance.cpp index 5ef9baef..ed8d3869 100644 --- a/src/test/TestDiagonalScoringPerformance.cpp +++ b/src/test/TestDiagonalScoringPerformance.cpp @@ -92,17 +92,17 @@ int main (int, const char**) { SubstitutionMatrix::calcLocalAaBiasCorrection(&subMat, s1.numSequence, s1.L, compositionBias, 1.0); - - matcher.processQuery(&s1,compositionBias, hits, 16); + matcher.createProfile(&s1, compositionBias); + matcher.align(hits, 16); std::cout << (int)hits[0].count << " "; std::cout << (int)hits[1].count << " "; std::cout << (int)hits[2].count << " "; std::cout << (int)hits[3].count << std::endl; - matcher.processQuery(&s1, compositionBias, hits, 1); - matcher.processQuery(&s1, compositionBias, hits + 1, 1); - matcher.processQuery(&s1, compositionBias, hits + 2, 1); - matcher.processQuery(&s1, compositionBias, hits + 3, 1); + matcher.align(hits, 1); + matcher.align(hits + 1, 1); + matcher.align(hits + 2, 1); + matcher.align(hits + 3, 1); std::cout << (int)hits[0].count<< " "; std::cout << (int)hits[1].count<< " "; @@ -114,7 +114,7 @@ int main (int, const char**) { hits[j].diagonal = rand()%s1.L; } // std::reverse(hits, hits+1000); - matcher.processQuery(&s1, compositionBias, hits, 16000); + matcher.align(hits, 16000); } // std::cout << ExtendedSubstitutionMatrix::calcScore(s1.sequence, s1.sequence,s1.L, subMat.subMatrix) << " " << (int)hits[0].diagonalScore << std::endl; // std::cout << (int)hits[0].diagonalScore << std::endl; diff --git a/src/util/expandaln.cpp b/src/util/expandaln.cpp index ebda49de..cbb767cc 100644 --- a/src/util/expandaln.cpp +++ b/src/util/expandaln.cpp @@ -321,8 +321,8 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl continue; } } else { - size_t cSeqId = cReader->getId(cSeqKey); if (returnAlnRes == false || par.expansionMode == Parameters::EXPAND_RESCORE_BACKTRACE) { + size_t cSeqId = cReader->getId(cSeqKey); cSeq.mapSequence(cSeqId, cSeqKey, cReader->getData(cSeqId, thread_idx), cReader->getSeqLen(cSeqId)); } //rescoreResultByBacktrace(resultAc, aSeq, cSeq, subMat, compositionBias, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid()); From a2f62ee20ac7c214730e74525d85557771c81249 Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Tue, 2 Apr 2024 22:30:54 +0900 Subject: [PATCH 2/3] pass correct parameter list to mergeresultsbyset --- data/structuresearch.sh | 2 +- src/workflow/StructureSearch.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/structuresearch.sh b/data/structuresearch.sh index 4e366c7c..ab00e550 100644 --- a/data/structuresearch.sh +++ b/data/structuresearch.sh @@ -65,7 +65,7 @@ if [ "$ALIGNMENT_ALGO" = "tmalign" ]; then if [ -n "${EXPAND}" ]; then if notExists "${TMP_PATH}/strualn_expanded.dbtype"; then # shellcheck disable=SC2086 - "$MMSEQS" mergeresultsbyset "${INTERMEDIATE}" "${TARGET_ALIGNMENT}_clu" "${TMP_PATH}/strualn_expanded" ${MERGERESULTBYSET_PAR} \ + "$MMSEQS" mergeresultsbyset "${INTERMEDIATE}" "${TARGET_ALIGNMENT}_clu" "${TMP_PATH}/strualn_expanded" ${MERGERESULTBYSET_PAR} \ || fail "Expand died" fi INTERMEDIATE="${TMP_PATH}/strualn_expanded" diff --git a/src/workflow/StructureSearch.cpp b/src/workflow/StructureSearch.cpp index 29f219b5..fb180b5e 100644 --- a/src/workflow/StructureSearch.cpp +++ b/src/workflow/StructureSearch.cpp @@ -216,7 +216,7 @@ int structuresearch(int argc, const char **argv, const Command &command) { EXIT(EXIT_FAILURE); } } - cmd.addVariable("MERGERESULTBYSET_PAR", par.createParameterString(par.threadsandcompression).c_str()); + cmd.addVariable("MERGERESULTBYSET_PAR", par.createParameterString(par.mergeresultsbyset).c_str()); cmd.addVariable("EXPAND", "1"); } std::string program = tmpDir + "/structuresearch.sh"; From b6dac8a54139632c9d251ebd3540df05fcd9bc54 Mon Sep 17 00:00:00 2001 From: Milot Mirdita Date: Thu, 11 Apr 2024 18:55:50 +0900 Subject: [PATCH 3/3] Fix wrong PDB output for large structures in convert2pdb --- src/strucclustutils/convert2pdb.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/strucclustutils/convert2pdb.cpp b/src/strucclustutils/convert2pdb.cpp index 64750581..73b40931 100644 --- a/src/strucclustutils/convert2pdb.cpp +++ b/src/strucclustutils/convert2pdb.cpp @@ -208,7 +208,7 @@ int convert2pdb(int argc, const char **argv, const Command& command) { aa = 'X'; } const char* aa3 = threeLetterLookup[(int)(aa - 'A')]; - fprintf(threadHandle, "ATOM % 5d CA %s %c% 4d% 12.3f% 8.3f% 8.3f\n", (int)(j + 1), aa3, chainName[0], int(j + 1), ca[j], ca[j + (1 * seqLen)], ca[j + (2 * seqLen)]); + fprintf(threadHandle, "ATOM %5d CA %s %c%4d %8.3f%8.3f%8.3f\n", (int)(j + 1), aa3, chainName[0], int(j + 1), ca[j], ca[j + (1 * seqLen)], ca[j + (2 * seqLen)]); } if (outputMode == LocalParameters::PDB_OUTPUT_MODE_MULTIMODEL) { fprintf(threadHandle, "ENDMDL\n");