Skip to content

Commit

Permalink
Expand can filter in each target cluster before expanding
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Aug 19, 2021
1 parent ae4c7ab commit 85ce847
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 8 deletions.
12 changes: 11 additions & 1 deletion src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ Parameters::Parameters():
PARAM_FILTER_HITS(PARAM_FILTER_HITS_ID, "--filter-hits", "Remove hits by seq. id. and coverage", "Filter hits by seq.id. and coverage", typeid(bool), (void *) &filterHits, "", MMseqsParameter::COMMAND_EXPERT),
PARAM_SORT_RESULTS(PARAM_SORT_RESULTS_ID, "--sort-results", "Sort results", "Sort results: 0: no sorting, 1: sort by E-value (Alignment) or seq.id. (Hamming)", typeid(int), (void *) &sortResults, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
// result2msa
PARAM_MSA_FORMAT_MODE(PARAM_MSA_FORMAT_MODE_ID, "--msa-format-mode", "MSA format mode", "Format MSA as: 0: binary cA3M DB\n1: binary ca3m w. consensus DB\n2: aligned FASTA DB\n3: aligned FASTA w. header summary\n4: STOCKHOLM flat file\n5: A3M format", typeid(int), (void *) &msaFormatMode, "^[0-5]{1}$"),
PARAM_MSA_FORMAT_MODE(PARAM_MSA_FORMAT_MODE_ID, "--msa-format-mode", "MSA format mode", "Format MSA as: 0: binary cA3M DB\n1: binary ca3m w. consensus DB\n2: aligned FASTA DB\n3: aligned FASTA w. header summary\n4: STOCKHOLM flat file\n5: A3M format\n6: A3M format w. alignment info", typeid(int), (void *) &msaFormatMode, "^[0-5]{1}$"),
PARAM_ALLOW_DELETION(PARAM_ALLOW_DELETION_ID, "--allow-deletion", "Allow deletions", "Allow deletions in a MSA", typeid(bool), (void *) &allowDeletion, ""),
PARAM_SUMMARY_PREFIX(PARAM_SUMMARY_PREFIX_ID, "--summary-prefix", "Summary prefix", "Set the cluster summary prefix", typeid(std::string), (void *) &summaryPrefix, "", MMseqsParameter::COMMAND_EXPERT),
PARAM_SKIP_QUERY(PARAM_SKIP_QUERY_ID, "--skip-query", "Skip query", "Skip the query sequence", typeid(bool), (void *) &skipQuery, "", MMseqsParameter::COMMAND_EXPERT),
Expand Down Expand Up @@ -272,6 +272,7 @@ Parameters::Parameters():
PARAM_TAX_DB_MODE(PARAM_TAX_DB_MODE_ID, "--tax-db-mode", "Taxonomy db mode", "Create taxonomy database as: 0: .dmp flat files (human readable) 1: binary dump (faster readin)", typeid(int), (void *) &taxDbMode, "^[0-1]{1}$"),
// expandaln
PARAM_EXPANSION_MODE(PARAM_EXPANSION_MODE_ID, "--expansion-mode", "Expansion mode", "Update score, E-value, and sequence identity by 0: input alignment 1: rescoring the inferred backtrace", typeid(int), (void *) &expansionMode, "^[0-2]{1}$"),
PARAM_EXPAND_FILTER_CLUSTERS(PARAM_EXPAND_FILTER_CLUSTERS_ID, "--expand-filter-clusters", "Expand filter clusters", "Filter each target cluster during expansion 0: no filter 1: filter", typeid(int), (void *) &expandFilterClusters, "^[0-1]{1}$"),
// taxonomy
PARAM_LCA_MODE(PARAM_LCA_MODE_ID, "--lca-mode", "LCA mode", "LCA Mode 1: single search LCA , 2/3: approximate 2bLCA, 4: top hit", typeid(int), (void *) &taxonomySearchMode, "^[1-4]{1}$"),
PARAM_TAX_OUTPUT_MODE(PARAM_TAX_OUTPUT_MODE_ID, "--tax-output-mode", "Taxonomy output mode", "0: output LCA, 1: output alignment 2: output both", typeid(int), (void *) &taxonomyOutputMode, "^[0-2]{1}$"),
Expand Down Expand Up @@ -1084,6 +1085,13 @@ Parameters::Parameters():
expandaln.push_back(&PARAM_PC_MODE);
expandaln.push_back(&PARAM_PCA);
expandaln.push_back(&PARAM_PCB);
expandaln.push_back(&PARAM_EXPAND_FILTER_CLUSTERS);
expandaln.push_back(&PARAM_FILTER_MIN_ENABLE);
expandaln.push_back(&PARAM_FILTER_MAX_SEQ_ID);
expandaln.push_back(&PARAM_FILTER_QID);
expandaln.push_back(&PARAM_FILTER_QSC);
expandaln.push_back(&PARAM_FILTER_COV);
expandaln.push_back(&PARAM_FILTER_NDIFF);
expandaln.push_back(&PARAM_PRELOAD_MODE);
expandaln.push_back(&PARAM_COMPRESSED);
expandaln.push_back(&PARAM_THREADS);
Expand All @@ -1107,6 +1115,7 @@ Parameters::Parameters():
expand2profile.push_back(&PARAM_MASK_PROFILE);
expand2profile.push_back(&PARAM_WG);
expand2profile.push_back(&PARAM_ALLOW_DELETION);
expand2profile.push_back(&PARAM_EXPAND_FILTER_CLUSTERS);
expand2profile.push_back(&PARAM_FILTER_MSA);
expand2profile.push_back(&PARAM_FILTER_MIN_ENABLE);
expand2profile.push_back(&PARAM_FILTER_MAX_SEQ_ID);
Expand Down Expand Up @@ -2403,6 +2412,7 @@ void Parameters::setDefaults() {

// expandaln
expansionMode = EXPAND_TRANSFER_EVALUE;
expandFilterClusters = 0;

// taxonomy
taxonomySearchMode = Parameters::TAXONOMY_APPROX_2BLCA;
Expand Down
2 changes: 2 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,7 @@ class Parameters {

// exapandaln
int expansionMode;
int expandFilterClusters;

// taxonomy
int taxonomySearchMode;
Expand Down Expand Up @@ -971,6 +972,7 @@ class Parameters {

// exapandaln
PARAMETER(PARAM_EXPANSION_MODE)
PARAMETER(PARAM_EXPAND_FILTER_CLUSTERS)

// taxonomy
PARAMETER(PARAM_LCA_MODE)
Expand Down
55 changes: 48 additions & 7 deletions src/util/expandaln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl
BacktraceTranslator translator;
SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0, par.scoreBias);

const bool filterBc = par.expandFilterClusters;
EvalueComputation *evaluer = NULL;
ProbabilityMatrix *probMatrix = NULL;
if (returnAlnRes == false) {
Expand All @@ -169,24 +170,33 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl

Sequence aSeq(par.maxSeqLen, aSeqDbType, &subMat, 0, false, par.compBiasCorrection);
Sequence cSeq(par.maxSeqLen, cSeqDbType, &subMat, 0, false, false);
Sequence* bSeq = NULL;
if (filterBc) {
bSeq = new Sequence(par.maxSeqLen, cSeqDbType, &subMat, 0, false, false);
}

MultipleAlignment *aligner = NULL;
MsaFilter *filter = NULL;
PSSMCalculator *calculator = NULL;
PSSMMasker *masker = NULL;
std::vector<std::vector<unsigned char>> seqSet;
std::vector<std::vector<unsigned char>> subSeqSet;
std::string result;

if (returnAlnRes == false) {
if (returnAlnRes == false || filterBc) {
aligner = new MultipleAlignment(par.maxSeqLen, &subMat);
if (par.filterMsa) {
if (par.filterMsa || filterBc) {
filter = new MsaFilter(par.maxSeqLen, 300, &subMat, par.gapOpen.values.aminoacid(), par.gapExtend.values.aminoacid());
}
subSeqSet.reserve(300);
}

if (returnAlnRes == false) {
// TODO: is this right?
calculator = new PSSMCalculator(&subMat, par.maxSeqLen, 300, par.pcmode, par.pca, par.pcb, par.gapOpen.values.aminoacid(), par.gapPseudoCount);
masker = new PSSMMasker(par.maxSeqLen, *probMatrix, subMat);
seqSet.reserve(300);
result.reserve(par.maxSeqLen * Sequence::PROFILE_READIN_SIZE);
seqSet.reserve(300);
}

size_t compBufferSize = (par.maxSeqLen + 1) * sizeof(float);
Expand Down Expand Up @@ -249,6 +259,32 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl
// CompressedA3M::extractMatcherResults(key, resultsBc, resultBcReader.getData(bResId, thread_idx), resultBcReader.getEntryLen(bResId), *cReader, false);
Matcher::readAlignmentResults(resultsBc, resultBcReader->getData(bResId, thread_idx), false);

if (filterBc) {
for (size_t k = 0; k < resultsBc.size(); ++k) {
Matcher::result_t &resultBc = resultsBc[k];
if (resultBc.backtrace.size() == 0) {
Debug(Debug::ERROR) << "Alignment must contain a backtrace\n";
EXIT(EXIT_FAILURE);
}
if (k == 0) {
unsigned int bSeqKey = resultAb.dbKey;
size_t bSeqId = cReader->getId(bSeqKey);
bSeq->mapSequence(bSeqId, bSeqKey, cReader->getData(bSeqId, thread_idx), cReader->getSeqLen(bSeqId));
} else {
unsigned int cSeqKey = resultBc.dbKey;
size_t cSeqId = cReader->getId(cSeqKey);
cSeq.mapSequence(cSeqId, cSeqKey, cReader->getData(cSeqId, thread_idx), cReader->getSeqLen(cSeqId));
subSeqSet.emplace_back(cSeq.numSequence, cSeq.numSequence + cSeq.L);
}
}
Matcher::result_t query = *(resultsBc.begin());
resultsBc.erase(resultsBc.begin());
MultipleAlignment::MSAResult res = aligner->computeMSA(bSeq, subSeqSet, resultsBc, true);
filter->filter(res, resultsBc, (int)(par.covMSAThr * 100), qid_vec, par.qsc, (int)(par.filterMaxSeqId * 100), par.Ndiff, par.filterMinEnable);
resultsBc.insert(resultsBc.begin(), query);
MultipleAlignment::deleteMSA(&res);
subSeqSet.clear();
}
//std::stable_sort(resultsBc.begin(), resultsBc.end(), compareHitsByKeyScore);

for (size_t k = 0; k < resultsBc.size(); ++k) {
Expand Down Expand Up @@ -362,18 +398,23 @@ int expandaln(int argc, const char **argv, const Command& command, bool returnAl
if (compositionBias != NULL) {
free(compositionBias);
}
if (returnAlnRes == false) {
if (aligner != NULL) {
delete aligner;
if (filter != NULL) {
delete filter;
}
}
if (filter != NULL) {
delete filter;
}
if (returnAlnRes == false) {
delete calculator;
delete masker;
}
while(intervalBuffer.size()){
delete intervalBuffer.top();
intervalBuffer.pop();
}
if (bSeq != NULL) {
delete bSeq;
}
}

writer.close(returnAlnRes == false);
Expand Down

0 comments on commit 85ce847

Please sign in to comment.