Skip to content

Commit

Permalink
Rework pairaln to support different pairing modes. Add support for du…
Browse files Browse the repository at this point in the history
…mmy sequences to result2msa
  • Loading branch information
martin-steinegger committed May 28, 2023
1 parent 12ba202 commit 1514015
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 19 deletions.
3 changes: 2 additions & 1 deletion src/alignment/MultipleAlignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,9 @@ void MultipleAlignment::updateGapsInSequenceSet(char **msaSequence, size_t cente
unsigned int queryPos = result.qStartPos;
unsigned int targetPos = result.dbStartPos;
// HACK: score was 0 and sequence was rejected, so we fill in an empty gap sequence
// Needed for pairaln with dummy sequences
if(targetPos == UINT_MAX) {
Debug(Debug::WARNING) << "Edge sequence " << i << " was not aligned." << "\n";
//Debug(Debug::WARNING) << "Edge sequence " << i << " was not aligned." << "\n";
// fill up with gaps
for(size_t pos = 0; pos < centerSeqSize; pos++){
edgeSeqMSA[pos] = '-';
Expand Down
9 changes: 9 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,9 @@ Parameters::Parameters():
// aggregatetax
PARAM_MAJORITY(PARAM_MAJORITY_ID, "--majority", "Majority threshold", "minimal fraction of agreement among taxonomically assigned sequences of a set", typeid(float), (void *) &majorityThr, "^0(\\.[0-9]+)?|^1(\\.0+)?$"),
PARAM_VOTE_MODE(PARAM_VOTE_MODE_ID, "--vote-mode", "Vote mode", "Mode of assigning weights to compute majority. 0: uniform, 1: minus log E-value, 2: score", typeid(int), (void *) &voteMode, "^[0-2]{1}$"),
// pairaln
PARAM_PAIRING_DUMMY_MODE(PARAM_PAIRING_DUMMY_MODE_ID, "--pairing-dummy-mode", "Include dummy pairing", "0: dont include, 1: include - an entry that will cause result2msa to write a gap only line", typeid(int), (void *) &pairdummymode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
PARAM_PAIRING_MODE(PARAM_PAIRING_MODE_ID, "--pairing-mode", "Pairing mode", "0: pair maximal per species, 1: pair only if all chains are covered per species", typeid(int), (void *) &pairmode, "^[0-1]{1}$", MMseqsParameter::COMMAND_EXPERT),
// taxonomyreport
PARAM_REPORT_MODE(PARAM_REPORT_MODE_ID, "--report-mode", "Report mode", "Taxonomy report mode 0: Kraken 1: Krona", typeid(int), (void *) &reportMode, "^[0-1]{1}$"),
// createtaxdb
Expand Down Expand Up @@ -1191,6 +1194,8 @@ Parameters::Parameters():
expand2profile.push_back(&PARAM_V);
pairaln.push_back(&PARAM_PRELOAD_MODE);
pairaln.push_back(&PARAM_PAIRING_DUMMY_MODE);
pairaln.push_back(&PARAM_PAIRING_MODE);
pairaln.push_back(&PARAM_COMPRESSED);
pairaln.push_back(&PARAM_THREADS);
pairaln.push_back(&PARAM_V);
Expand Down Expand Up @@ -2512,6 +2517,10 @@ void Parameters::setDefaults() {
majorityThr = 0.5;
voteMode = AGG_TAX_MINUS_LOG_EVAL;

// pairaln
pairdummymode = PAIRALN_DUMMY_MODE_OFF;
pairmode = PAIRALN_MODE_ALL_PER_SPECIES;

// taxonomyreport
reportMode = 0;

Expand Down
16 changes: 16 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,14 @@ class Parameters {
static const int AGG_TAX_MINUS_LOG_EVAL = 1;
static const int AGG_TAX_SCORE = 2;

// pairaln dummy mode
static const int PAIRALN_DUMMY_MODE_OFF = 0;
static const int PAIRALN_DUMMY_MODE_ON = 1;

// pairaln mode
static const int PAIRALN_MODE_ALL_PER_SPECIES = 0;
static const int PAIRALN_MODE_COVER_ALL_CHAINS = 1;

// taxonomy search strategy
static const int TAXONOMY_SINGLE_SEARCH = 1;
static const int TAXONOMY_2BLCA = 2;
Expand Down Expand Up @@ -644,6 +652,10 @@ class Parameters {
float majorityThr;
int voteMode;

// pairaln
int pairdummymode;
int pairmode;

// taxonomyreport
int reportMode;

Expand Down Expand Up @@ -983,6 +995,10 @@ class Parameters {
PARAMETER(PARAM_MAJORITY)
PARAMETER(PARAM_VOTE_MODE)

// pairaln
PARAMETER(PARAM_PAIRING_DUMMY_MODE)
PARAMETER(PARAM_PAIRING_MODE)

// taxonomyreport
PARAMETER(PARAM_REPORT_MODE)

Expand Down
55 changes: 45 additions & 10 deletions src/util/pairaln.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,23 +57,30 @@ int pairaln(int argc, const char **argv, const Command& command) {
std::vector<Matcher::result_t> result;
result.reserve(100000);
std::unordered_map<unsigned int, size_t> findPair;
std::vector<unsigned int> taxonToPair;
std::string output;
output.reserve(100000);
bool hasBacktrace = false;
#pragma omp for schedule(dynamic, 1)
for (size_t fileNumber = 0; fileNumber < fileToIds.size(); fileNumber++) {
for (size_t fileNumber = 0; fileNumber < fileToIds.size(); fileNumber++) {
char buffer[1024 + 32768 * 4];
findPair.clear();
taxonToPair.clear();
progress.updateProgress();

unsigned int minResultDbKey = UINT_MAX;
// find intersection between all proteins
for (size_t i = 0; i < fileToIds[fileNumber].size(); i++) {
result.clear();
size_t id = fileToIds[fileNumber][i];
Matcher::readAlignmentResults(result, alnDbr.getData(id, thread_idx), true);

for (size_t resIdx = 0; resIdx < result.size(); ++resIdx) {
hasBacktrace = result[resIdx].backtrace.size() > 0;
unsigned int taxon = mapping->lookup(result[resIdx].dbKey);
// we don't want to introduce a new field, reuse existing unused field here
result[resIdx].dbOrfStartPos = taxon;
minResultDbKey = std::min(result[resIdx].dbKey, minResultDbKey);
}
std::stable_sort(result.begin(), result.end(), compareByTaxId);
unsigned int prevTaxon = UINT_MAX;
Expand All @@ -92,6 +99,16 @@ int pairaln(int argc, const char **argv, const Command& command) {
}
}

// fill taxonToPair vector
std::unordered_map<unsigned int, size_t>::iterator it;
for (it = findPair.begin(); it != findPair.end(); ++it) {
int thresholdToPair = (par.pairmode == Parameters::PAIRALN_MODE_ALL_PER_SPECIES) ? 1 : fileToIds[fileNumber].size() - 1;
if (it->second > thresholdToPair) {
taxonToPair.emplace_back(it->first);
}
}
std::sort(taxonToPair.begin(), taxonToPair.end());

for (size_t i = 0; i < fileToIds[fileNumber].size(); i++) {
result.clear();
output.clear();
Expand All @@ -103,21 +120,39 @@ int pairaln(int argc, const char **argv, const Command& command) {
// we don't want to introduce a new field, reuse existing unused field here
result[resIdx].dbOrfStartPos = taxon;
}

// stable sort is required to assure that best hit is first per taxon
std::stable_sort(result.begin(), result.end(), compareByTaxId);
unsigned int prevTaxon = UINT_MAX;
for (size_t resIdx = 0; resIdx < result.size(); ++resIdx) {
unsigned int taxon = result[resIdx].dbOrfStartPos;
// found pair!
if (taxon != prevTaxon
&& findPair.find(taxon) != findPair.end()
&& findPair[taxon] == fileToIds[fileNumber].size()) {
bool hasBacktrace = result[resIdx].backtrace.size() > 0;
size_t len = Matcher::resultToBuffer(buffer, result[resIdx], hasBacktrace, false, false);
// iterate over taxonToPair
size_t resIdxStart = 0;
for(size_t taxonInList : taxonToPair) {
bool taxonFound = false;
for (size_t resIdx = resIdxStart; resIdx < result.size(); ++resIdx) {
unsigned int taxon = result[resIdx].dbOrfStartPos;
// check if this taxon has enough information to pair
if(taxonInList != taxon){
continue;
}
bool bestTaxonHit = (taxon != prevTaxon);
taxonFound = true;
if(bestTaxonHit){
size_t len = Matcher::resultToBuffer(buffer, result[resIdx], hasBacktrace, false, false);
output.append(buffer, len);
resIdxStart = resIdx + 1;
break;
}
prevTaxon = taxon;
}
if(taxonFound == false && par.pairdummymode == Parameters::PAIRALN_DUMMY_MODE_ON){ // par.addDummyPairedAlignment
// write an Matcher::result_t with UINT_MAX as dbKey
Matcher::result_t emptyResult(minResultDbKey, 1, 1, 0, 1, 0,
0, UINT_MAX, 0, 0, UINT_MAX, 0, 0, "1M");
size_t len = Matcher::resultToBuffer(buffer, emptyResult, hasBacktrace, false, false);
output.append(buffer, len);
}
prevTaxon = taxon;
}

resultWriter.writeData(output.c_str(), output.length(), alnDbr.getDbKey(id), thread_idx);
}
}
Expand Down
64 changes: 56 additions & 8 deletions src/util/result2msa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -324,9 +324,21 @@ int result2msa(int argc, const char **argv, const Command &command) {
header = targetHeaderReader->getData(id, thread_idx);
length = targetHeaderReader->getEntryLen(id) - 1;
}
bool isOnlyGap = true;
for (size_t pos = 0; pos < res.centerLength; pos++) {
char aa = res.msaSequence[i][pos];
if (aa != MultipleAlignment::GAP) {
isOnlyGap = false;
break;
}
}

result.append(1, '>');
result.append(header, length);
if(isOnlyGap) {
result.append("DUMMY\n");
}else{
result.append(header, length);
}
// need to allow insertion in the centerSequence
for (size_t pos = 0; pos < res.centerLength; pos++) {
char aa = res.msaSequence[i][pos];
Expand All @@ -353,13 +365,31 @@ int result2msa(int argc, const char **argv, const Command &command) {
continue;
}

char *header;
const char *header;
bool isOnlyGap = true;
for (size_t pos = 0; pos < res.centerLength; pos++) {
char aa = res.msaSequence[i][pos];
if (aa != MultipleAlignment::GAP) {
isOnlyGap = false;
break;
}
}


if (i == 0) {
header = centerSequenceHeader;
if(isOnlyGap) {
header = "DUMMY";
}else{
header = centerSequenceHeader;
}
} else {
unsigned int key = seqKeys[i - 1];
size_t id = targetHeaderReader->getId(key);
header = targetHeaderReader->getData(id, thread_idx);
if(isOnlyGap) {
header = "DUMMY";
}else {
unsigned int key = seqKeys[i - 1];
size_t id = targetHeaderReader->getId(key);
header = targetHeaderReader->getData(id, thread_idx);
}
}
accession = Util::parseFastaHeader(header);

Expand All @@ -386,12 +416,30 @@ int result2msa(int argc, const char **argv, const Command &command) {
}

result.push_back('>');
bool isOnlyGap = true;
for (size_t pos = 0; pos < res.centerLength; pos++) {
char aa = res.msaSequence[i][pos];
if (aa != MultipleAlignment::GAP) {
isOnlyGap = false;
break;
}
}


if (i == 0) {
result.append(Util::parseFastaHeader(centerSequenceHeader));
if(isOnlyGap){
result.append("DUMMY");
}else{
result.append(Util::parseFastaHeader(centerSequenceHeader));
}
} else {
unsigned int key = seqKeys[i - 1];
size_t id = targetHeaderReader->getId(key);
result.append(Util::parseFastaHeader(targetHeaderReader->getData(id, thread_idx)));
if(isOnlyGap){
result.append("DUMMY");
}else {
result.append(Util::parseFastaHeader(targetHeaderReader->getData(id, thread_idx)));
}
if (par.msaFormatMode == Parameters::FORMAT_MSA_A3M_ALN_INFO) {
size_t len = Matcher::resultToBuffer(buffer, alnResults[i - 1], false);
char* data = buffer;
Expand Down

0 comments on commit 1514015

Please sign in to comment.