diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index 663d6bfe8..538d3510e 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -104,7 +104,7 @@ Parameters::Parameters(): PARAM_FILTER_HITS(PARAM_FILTER_HITS_ID, "--filter-hits", "Remove hits by seq. id. and coverage", "Filter hits by seq.id. and coverage", typeid(bool), (void *) &filterHits, "", MMseqsParameter::COMMAND_EXPERT), PARAM_SORT_RESULTS(PARAM_SORT_RESULTS_ID, "--sort-results", "Sort results", "Sort results: 0: no sorting, 1: sort by E-value (Alignment) or seq.id. (Hamming)", typeid(int), (void *) &sortResults, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT), // result2msa - PARAM_MSA_FORMAT_MODE(PARAM_MSA_FORMAT_MODE_ID, "--msa-format-mode", "MSA format mode", "Format MSA as: 0: binary cA3M DB\n1: binary ca3m w. consensus DB\n2: aligned FASTA DB\n3: aligned FASTA w. header summary\n4: STOCKHOLM flat file\n5: A3M format", typeid(int), (void *) &msaFormatMode, "^[0-5]{1}$"), + PARAM_MSA_FORMAT_MODE(PARAM_MSA_FORMAT_MODE_ID, "--msa-format-mode", "MSA format mode", "Format MSA as: 0: binary cA3M DB\n1: binary ca3m w. consensus DB\n2: aligned FASTA DB\n3: aligned FASTA w. header summary\n4: STOCKHOLM flat file\n5: A3M format\n6: A3M format w. alignment info", typeid(int), (void *) &msaFormatMode, "^[0-5]{1}$"), PARAM_ALLOW_DELETION(PARAM_ALLOW_DELETION_ID, "--allow-deletion", "Allow deletions", "Allow deletions in a MSA", typeid(bool), (void *) &allowDeletion, ""), PARAM_SUMMARY_PREFIX(PARAM_SUMMARY_PREFIX_ID, "--summary-prefix", "Summary prefix", "Set the cluster summary prefix", typeid(std::string), (void *) &summaryPrefix, "", MMseqsParameter::COMMAND_EXPERT), PARAM_SKIP_QUERY(PARAM_SKIP_QUERY_ID, "--skip-query", "Skip query", "Skip the query sequence", typeid(bool), (void *) &skipQuery, "", MMseqsParameter::COMMAND_EXPERT), @@ -272,6 +272,7 @@ Parameters::Parameters(): PARAM_TAX_DB_MODE(PARAM_TAX_DB_MODE_ID, "--tax-db-mode", "Taxonomy db mode", "Create taxonomy database as: 0: .dmp flat files (human readable) 1: binary dump (faster readin)", typeid(int), (void *) &taxDbMode, "^[0-1]{1}$"), // expandaln PARAM_EXPANSION_MODE(PARAM_EXPANSION_MODE_ID, "--expansion-mode", "Expansion mode", "Update score, E-value, and sequence identity by 0: input alignment 1: rescoring the inferred backtrace", typeid(int), (void *) &expansionMode, "^[0-2]{1}$"), + PARAM_EXPAND_FILTER_CLUSTERS(PARAM_EXPAND_FILTER_CLUSTERS_ID, "--expand-filter-clusters", "Expand filter clusters", "Filter each target cluster during expansion 0: no filter 1: filter", typeid(int), (void *) &expandFilterClusters, "^[0-1]{1}$"), // taxonomy PARAM_LCA_MODE(PARAM_LCA_MODE_ID, "--lca-mode", "LCA mode", "LCA Mode 1: single search LCA , 2/3: approximate 2bLCA, 4: top hit", typeid(int), (void *) &taxonomySearchMode, "^[1-4]{1}$"), PARAM_TAX_OUTPUT_MODE(PARAM_TAX_OUTPUT_MODE_ID, "--tax-output-mode", "Taxonomy output mode", "0: output LCA, 1: output alignment 2: output both", typeid(int), (void *) &taxonomyOutputMode, "^[0-2]{1}$"), @@ -1084,6 +1085,13 @@ Parameters::Parameters(): expandaln.push_back(&PARAM_PC_MODE); expandaln.push_back(&PARAM_PCA); expandaln.push_back(&PARAM_PCB); + expandaln.push_back(&PARAM_EXPAND_FILTER_CLUSTERS); + expandaln.push_back(&PARAM_FILTER_MIN_ENABLE); + expandaln.push_back(&PARAM_FILTER_MAX_SEQ_ID); + expandaln.push_back(&PARAM_FILTER_QID); + expandaln.push_back(&PARAM_FILTER_QSC); + expandaln.push_back(&PARAM_FILTER_COV); + expandaln.push_back(&PARAM_FILTER_NDIFF); expandaln.push_back(&PARAM_PRELOAD_MODE); expandaln.push_back(&PARAM_COMPRESSED); expandaln.push_back(&PARAM_THREADS); @@ -1107,6 +1115,7 @@ Parameters::Parameters(): expand2profile.push_back(&PARAM_MASK_PROFILE); expand2profile.push_back(&PARAM_WG); expand2profile.push_back(&PARAM_ALLOW_DELETION); + expand2profile.push_back(&PARAM_EXPAND_FILTER_CLUSTERS); expand2profile.push_back(&PARAM_FILTER_MSA); expand2profile.push_back(&PARAM_FILTER_MIN_ENABLE); expand2profile.push_back(&PARAM_FILTER_MAX_SEQ_ID); @@ -2403,6 +2412,7 @@ void Parameters::setDefaults() { // expandaln expansionMode = EXPAND_TRANSFER_EVALUE; + expandFilterClusters = 0; // taxonomy taxonomySearchMode = Parameters::TAXONOMY_APPROX_2BLCA; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 00d432e18..4ba8e6cce 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -641,6 +641,7 @@ class Parameters { // exapandaln int expansionMode; + int expandFilterClusters; // taxonomy int taxonomySearchMode; @@ -971,6 +972,7 @@ class Parameters { // exapandaln PARAMETER(PARAM_EXPANSION_MODE) + PARAMETER(PARAM_EXPAND_FILTER_CLUSTERS) // taxonomy PARAMETER(PARAM_LCA_MODE) diff --git a/src/util/expandaln.cpp b/src/util/expandaln.cpp index 686d00db8..ff1bebd40 100644 --- a/src/util/expandaln.cpp +++ b/src/util/expandaln.cpp @@ -152,6 +152,7 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl BacktraceTranslator translator; SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias); + const bool filterBc = par.expandFilterClusters; EvalueComputation *evaluer = NULL; ProbabilityMatrix *probMatrix = NULL; if (returnAlnRes == false) { @@ -169,24 +170,33 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl Sequence aSeq(par.maxSeqLen, aSeqDbType, &subMat, 0, false, par.compBiasCorrection); Sequence cSeq(par.maxSeqLen, cSeqDbType, &subMat, 0, false, false); + Sequence* bSeq = NULL; + if (filterBc) { + bSeq = new Sequence(par.maxSeqLen, cSeqDbType, &subMat, 0, false, false); + } MultipleAlignment *aligner = NULL; MsaFilter *filter = NULL; PSSMCalculator *calculator = NULL; PSSMMasker *masker = NULL; std::vector> seqSet; + std::vector> subSeqSet; std::string result; - if (returnAlnRes == false) { + if (returnAlnRes == false || filterBc) { aligner = new MultipleAlignment(par.maxSeqLen, &subMat); - if (par.filterMsa) { + if (par.filterMsa || filterBc) { filter = new MsaFilter(par.maxSeqLen, 300, &subMat, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid()); } + subSeqSet.reserve(300); + } + + if (returnAlnRes == false) { // TODO: is this right? calculator = new PSSMCalculator(&subMat, par.maxSeqLen, 300, par.pcmode, par.pca, par.pcb, par.gapOpen.values.aminoacid(), par.gapPseudoCount); masker = new PSSMMasker(par.maxSeqLen, *probMatrix, subMat); - seqSet.reserve(300); result.reserve(par.maxSeqLen * Sequence::PROFILE_READIN_SIZE); + seqSet.reserve(300); } size_t compBufferSize = (par.maxSeqLen + 1) * sizeof(float); @@ -249,6 +259,32 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl // CompressedA3M::extractMatcherResults(key, resultsBc, resultBcReader.getData(bResId, thread_idx), resultBcReader.getEntryLen(bResId), *cReader, false); Matcher::readAlignmentResults(resultsBc, resultBcReader->getData(bResId, thread_idx), false); + if (filterBc) { + for (size_t k = 0; k < resultsBc.size(); ++k) { + Matcher::result_t &resultBc = resultsBc[k]; + if (resultBc.backtrace.size() == 0) { + Debug(Debug::ERROR) << "Alignment must contain a backtrace\n"; + EXIT(EXIT_FAILURE); + } + if (k == 0) { + unsigned int bSeqKey = resultAb.dbKey; + size_t bSeqId = cReader->getId(bSeqKey); + bSeq->mapSequence(bSeqId, bSeqKey, cReader->getData(bSeqId, thread_idx), cReader->getSeqLen(bSeqId)); + } else { + unsigned int cSeqKey = resultBc.dbKey; + size_t cSeqId = cReader->getId(cSeqKey); + cSeq.mapSequence(cSeqId, cSeqKey, cReader->getData(cSeqId, thread_idx), cReader->getSeqLen(cSeqId)); + subSeqSet.emplace_back(cSeq.numSequence, cSeq.numSequence + cSeq.L); + } + } + Matcher::result_t query = *(resultsBc.begin()); + resultsBc.erase(resultsBc.begin()); + MultipleAlignment::MSAResult res = aligner->computeMSA(bSeq, subSeqSet, resultsBc, true); + filter->filter(res, resultsBc, (int)(par.covMSAThr * 100), qid_vec, par.qsc, (int)(par.filterMaxSeqId * 100), par.Ndiff, par.filterMinEnable); + resultsBc.insert(resultsBc.begin(), query); + MultipleAlignment::deleteMSA(&res); + subSeqSet.clear(); + } //std::stable_sort(resultsBc.begin(), resultsBc.end(), compareHitsByKeyScore); for (size_t k = 0; k < resultsBc.size(); ++k) { @@ -362,11 +398,13 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl if (compositionBias != NULL) { free(compositionBias); } - if (returnAlnRes == false) { + if (aligner != NULL) { delete aligner; - if (filter != NULL) { - delete filter; - } + } + if (filter != NULL) { + delete filter; + } + if (returnAlnRes == false) { delete calculator; delete masker; } @@ -374,6 +412,9 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl delete intervalBuffer.top(); intervalBuffer.pop(); } + if (bSeq != NULL) { + delete bSeq; + } } writer.close(returnAlnRes == false);