Skip to content

Commit

Permalink
Add target similiar k-mer search to prefiler
Browse files Browse the repository at this point in the history
  • Loading branch information
martin-steinegger committed May 28, 2023
1 parent 390457d commit 71dd32e
Show file tree
Hide file tree
Showing 10 changed files with 94 additions and 48 deletions.
3 changes: 3 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Parameters::Parameters():
alphabetSize(NuclAA<int>(INT_MAX,INT_MAX)),
PARAM_S(PARAM_S_ID, "-s", "Sensitivity", "Sensitivity: 1.0 faster; 4.0 fast; 7.5 sensitive", typeid(float), (void *) &sensitivity, "^[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_PREFILTER),
PARAM_K(PARAM_K_ID, "-k", "k-mer length", "k-mer length (0: automatically set to optimum)", typeid(int), (void *) &kmerSize, "^[0-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
PARAM_TARGET_SEARCH_MODE(PARAM_TARGET_SEARCH_MODE_ID, "--target-search-mode", "Target search mode", "target search mode (0: regular k-mer, 1: similar k-mer)", typeid(int), (void *) &targetSearchMode, "^[0-1]{1}$", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
PARAM_THREADS(PARAM_THREADS_ID, "--threads", "Threads", "Number of CPU-cores used (all by default)", typeid(int), (void *) &threads, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_COMMON),
PARAM_COMPRESSED(PARAM_COMPRESSED_ID, "--compressed", "Compressed", "Write compressed output", typeid(int), (void *) &compressed, "^[0-1]{1}$", MMseqsParameter::COMMAND_COMMON),
PARAM_ALPH_SIZE(PARAM_ALPH_SIZE_ID, "--alph-size", "Alphabet size", "Alphabet size (range 2-21)", typeid(MultiParam<NuclAA<int>>), (void *) &alphabetSize, "", MMseqsParameter::COMMAND_PREFILTER | MMseqsParameter::COMMAND_CLUSTLINEAR | MMseqsParameter::COMMAND_EXPERT),
Expand Down Expand Up @@ -392,6 +393,7 @@ Parameters::Parameters():
prefilter.push_back(&PARAM_SEED_SUB_MAT);
prefilter.push_back(&PARAM_S);
prefilter.push_back(&PARAM_K);
prefilter.push_back(&PARAM_TARGET_SEARCH_MODE);
prefilter.push_back(&PARAM_K_SCORE);
prefilter.push_back(&PARAM_ALPH_SIZE);
prefilter.push_back(&PARAM_MAX_SEQ_LEN);
Expand Down Expand Up @@ -2211,6 +2213,7 @@ void Parameters::setDefaults() {
seedScoringMatrixFile = MultiParam<NuclAA<std::string>>(NuclAA<std::string>("VTML80.out", "nucleotide.out"));

kmerSize = 0;
targetSearchMode = 0;
kmerScore.values = INT_MAX;
alphabetSize = MultiParam<NuclAA<int>>(NuclAA<int>(21,5));
maxSeqLen = MAX_SEQ_LEN; // 2^16
Expand Down
2 changes: 2 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,7 @@ class Parameters {
// PREFILTER
float sensitivity; // target sens
int kmerSize; // kmer size for the prefilter
int targetSearchMode; // target search mode
MultiParam<SeqProf<int>> kmerScore; // kmer score for the prefilter
MultiParam<NuclAA<int>> alphabetSize; // alphabet size for the prefilter
int compBiasCorrection; // Aminoacid composiont correction
Expand Down Expand Up @@ -716,6 +717,7 @@ class Parameters {

PARAMETER(PARAM_S)
PARAMETER(PARAM_K)
PARAMETER(PARAM_TARGET_SEARCH_MODE)
PARAMETER(PARAM_THREADS)
PARAMETER(PARAM_COMPRESSED)
PARAMETER(PARAM_ALPH_SIZE)
Expand Down
35 changes: 25 additions & 10 deletions src/prefiltering/IndexBuilder.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "IndexBuilder.h"
#include "tantan.h"
#include "ExtendedSubstitutionMatrix.h"

#ifdef OPENMP
#include <omp.h>
Expand Down Expand Up @@ -51,13 +52,14 @@ class DbInfo {


void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedLookup,
SequenceLookup **unmaskedLookup,BaseMatrix &subMat, Sequence *seq,
SequenceLookup **unmaskedLookup,BaseMatrix &subMat,
ScoreMatrix & three, ScoreMatrix & two, Sequence *seq,
DBReader<unsigned int> *dbr, size_t dbFrom, size_t dbTo, int kmerThr,
bool mask, bool maskLowerCaseMode, float maskProb) {
bool mask, bool maskLowerCaseMode, float maskProb, int targetSearchMode) {
Debug(Debug::INFO) << "Index table: counting k-mers\n";

const bool isProfile = Parameters::isEqualDbtype(seq->getSeqType(), Parameters::DBTYPE_HMM_PROFILE);

const bool isTargetSimiliarKmerSearch = isProfile || targetSearchMode;
dbTo = std::min(dbTo, dbr->getSize());
size_t dbSize = dbTo - dbFrom;
DbInfo* info = new DbInfo(dbFrom, dbTo, seq->getEffectiveKmerSize(), *dbr);
Expand Down Expand Up @@ -101,9 +103,13 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
Sequence s(seq->getMaxLen(), seq->getSeqType(), &subMat, seq->getKmerSize(), seq->isSpaced(), false, true, seq->getUserSpacedKmerPattern());

KmerGenerator *generator = NULL;
if (isProfile) {
if (isTargetSimiliarKmerSearch) {
generator = new KmerGenerator(seq->getKmerSize(), indexTable->getAlphabetSize(), kmerThr);
generator->setDivideStrategy(s.profile_matrix);
if(isProfile){
generator->setDivideStrategy(s.profile_matrix);
}else{
generator->setDivideStrategy(&three, &two);
}
}

unsigned int *buffer = static_cast<unsigned int*>(malloc(seq->getMaxLen() * sizeof(unsigned int)));
Expand All @@ -122,10 +128,15 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
bufferSize = seq->getMaxLen();
}
// count similar or exact k-mers based on sequence type
if (isProfile) {
if (isTargetSimiliarKmerSearch) {
// Find out if we should also mask profiles
totalKmerCount += indexTable->addSimilarKmerCount(&s, generator);
(*unmaskedLookup)->addSequence(s.numConsensusSequence, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
unsigned char * seq = (isProfile) ? s.numConsensusSequence : s.numSequence;
if (unmaskedLookup != NULL) {
(*unmaskedLookup)->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
} else if (maskedLookup != NULL) {
(*maskedLookup)->addSequence(seq, s.L, id - dbFrom, info->sequenceOffsets[id - dbFrom]);
}
} else {
// Do not mask if column state sequences are used
if (unmaskedLookup != NULL) {
Expand Down Expand Up @@ -219,9 +230,13 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
IndexEntryLocalTmp *buffer = static_cast<IndexEntryLocalTmp *>(malloc( seq->getMaxLen() * sizeof(IndexEntryLocalTmp)));
size_t bufferSize = seq->getMaxLen();
KmerGenerator *generator = NULL;
if (isProfile) {
if (isTargetSimiliarKmerSearch) {
generator = new KmerGenerator(seq->getKmerSize(), indexTable->getAlphabetSize(), kmerThr);
generator->setDivideStrategy(s.profile_matrix);
if(isProfile){
generator->setDivideStrategy(s.profile_matrix);
}else{
generator->setDivideStrategy(&three, &two);
}
}

#pragma omp for schedule(dynamic, 100)
Expand All @@ -230,7 +245,7 @@ void IndexBuilder::fillDatabase(IndexTable *indexTable, SequenceLookup **maskedL
progress2.updateProgress();

unsigned int qKey = dbr->getDbKey(id);
if (isProfile) {
if (isTargetSimiliarKmerSearch) {
s.mapSequence(id - dbFrom, qKey, dbr->getData(id, thread_idx), dbr->getSeqLen(id));
indexTable->addSimilarSequence(&s, generator, &buffer, bufferSize, &idxer);
} else {
Expand Down
7 changes: 5 additions & 2 deletions src/prefiltering/IndexBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
#define MMSEQS_INDEXBUILDER_H

#include "IndexTable.h"
#include "ExtendedSubstitutionMatrix.h"

class IndexBuilder {
public:
static void fillDatabase(IndexTable *indexTable, SequenceLookup **maskedLookup, SequenceLookup **unmaskedLookup,
BaseMatrix &subMat, Sequence *seq,
DBReader<unsigned int> *dbr, size_t dbFrom, size_t dbTo, int kmerThr, bool mask, bool maskLowerCaseMode, float maskProb);
BaseMatrix &subMat,
ScoreMatrix & three, ScoreMatrix & two, Sequence *seq,
DBReader<unsigned int> *dbr, size_t dbFrom, size_t dbTo, int kmerThr,
bool mask, bool maskLowerCaseMode, float maskProb, int targetSearchMode);
};

#endif
6 changes: 6 additions & 0 deletions src/prefiltering/IndexTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ class IndexTable {
//idxer->reset();
while(s->hasNextKmer()){
const unsigned char * kmer = s->nextKmer();
if(s->kmerContainsX()){
continue;
}
const std::pair<size_t *, size_t> kmerList = kmerGenerator->generateKmerList(kmer);

//unsigned int kmerIdx = idxer->int2index(kmer, 0, kmerSize);
Expand Down Expand Up @@ -302,6 +305,9 @@ class IndexTable {
size_t kmerPos = 0;
while(s->hasNextKmer()){
const unsigned char * kmer = s->nextKmer();
if(s->kmerContainsX()){
continue;
}
std::pair<size_t *, size_t> scoreMatrix = kmerGenerator->generateKmerList(kmer);
if(kmerPos+scoreMatrix.second >= bufferSize){
*buffer = static_cast<IndexEntryLocalTmp*>(realloc(*buffer, sizeof(IndexEntryLocalTmp) * bufferSize*2));
Expand Down
29 changes: 18 additions & 11 deletions src/prefiltering/Prefiltering.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#include "Prefiltering.h"
#include "NucleotideMatrix.h"
#include "ReducedMatrix.h"
#include "ExtendedSubstitutionMatrix.h"
#include "SubstitutionMatrixProfileStates.h"
#include "DBWriter.h"
#include "QueryMatcherTaxonomyHook.h"
Expand Down Expand Up @@ -42,6 +41,7 @@ Prefiltering::Prefiltering(const std::string &queryDB,
scoringMatrixFile(par.scoringMatrixFile),
seedScoringMatrixFile(par.seedScoringMatrixFile),
targetSeqType(targetSeqType),
targetSearchMode(par.targetSearchMode),
maxResListLen(par.maxResListLen),
sensitivity(par.sensitivity),
maxSeqLen(par.maxSeqLen),
Expand All @@ -52,7 +52,8 @@ Prefiltering::Prefiltering(const std::string &queryDB,
aaBiasCorrectionScale(par.compBiasCorrectionScale),
covThr(par.covThr), covMode(par.covMode), includeIdentical(par.includeIdentity),
preloadMode(par.preloadMode),
threads(static_cast<unsigned int>(par.threads)), compressed(par.compressed) {
threads(static_cast<unsigned int>(par.threads)),
compressed(par.compressed) {
sameQTDB = isSameQTDB();

// init the substitution matrices
Expand Down Expand Up @@ -173,7 +174,8 @@ Prefiltering::Prefiltering(const std::string &queryDB,

takeOnlyBestKmer = (par.exactKmerMatching==1) ||
(Parameters::isEqualDbtype(targetSeqType, Parameters::DBTYPE_HMM_PROFILE) && Parameters::isEqualDbtype(querySeqType,Parameters::DBTYPE_AMINO_ACIDS)) ||
(Parameters::isEqualDbtype(targetSeqType, Parameters::DBTYPE_NUCLEOTIDES) && Parameters::isEqualDbtype(querySeqType,Parameters::DBTYPE_NUCLEOTIDES));
(Parameters::isEqualDbtype(targetSeqType, Parameters::DBTYPE_NUCLEOTIDES) && Parameters::isEqualDbtype(querySeqType,Parameters::DBTYPE_NUCLEOTIDES)) ||
(targetSearchMode == 1);

// memoryLimit in bytes
size_t memoryLimit=Util::computeMemory(par.splitMemoryLimit);
Expand Down Expand Up @@ -203,6 +205,13 @@ Prefiltering::Prefiltering(const std::string &queryDB,

Debug(Debug::INFO) << "Target database size: " << tdbr->getSize() << " type: " <<Parameters::getDbTypeName(targetSeqType) << "\n";

if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_AMINO_ACIDS)) {
kmerSubMat->alphabetSize = kmerSubMat->alphabetSize - 1;
_2merSubMatrix = getScoreMatrix(*kmerSubMat, 2);
_3merSubMatrix = getScoreMatrix(*kmerSubMat, 3);
kmerSubMat->alphabetSize = alphabetSize;
}

if (splitMode == Parameters::QUERY_DB_SPLIT) {
// create the whole index table
getIndexTable(0, 0, tdbr->getSize());
Expand All @@ -214,12 +223,7 @@ Prefiltering::Prefiltering(const std::string &queryDB,
EXIT(EXIT_FAILURE);
}

if (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_AMINO_ACIDS)) {
kmerSubMat->alphabetSize = kmerSubMat->alphabetSize - 1;
_2merSubMatrix = getScoreMatrix(*kmerSubMat, 2);
_3merSubMatrix = getScoreMatrix(*kmerSubMat, 3);
kmerSubMat->alphabetSize = alphabetSize;
}


if (par.taxonList.length() > 0) {
taxonomyHook = new QueryMatcherTaxonomyHook(targetDB, tdbr, par.taxonList);
Expand Down Expand Up @@ -519,7 +523,7 @@ void Prefiltering::getIndexTable(int split, size_t dbFrom, size_t dbSize) {
Sequence tseq(maxSeqLen, targetSeqType, kmerSubMat, kmerSize, spacedKmer, aaBiasCorrection, true, spacedKmerPattern);
int localKmerThr = (Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_HMM_PROFILE) ||
Parameters::isEqualDbtype(querySeqType, Parameters::DBTYPE_NUCLEOTIDES) ||
(Parameters::isEqualDbtype(targetSeqType, Parameters::DBTYPE_HMM_PROFILE) == false && takeOnlyBestKmer == true) ) ? 0 : kmerThr;
(Parameters::isEqualDbtype(targetSeqType, Parameters::DBTYPE_HMM_PROFILE) == false && targetSearchMode == 0 && takeOnlyBestKmer == true) ) ? 0 : kmerThr;

// remove X or N for seeding
int adjustAlphabetSize = (Parameters::isEqualDbtype(targetSeqType, Parameters::DBTYPE_NUCLEOTIDES) ||
Expand All @@ -530,7 +534,10 @@ void Prefiltering::getIndexTable(int split, size_t dbFrom, size_t dbSize) {
SequenceLookup **maskedLookup = maskMode == 1 || maskLowerCaseMode == 1 ? &sequenceLookup : NULL;

Debug(Debug::INFO) << "Index table k-mer threshold: " << localKmerThr << " at k-mer size " << kmerSize << " \n";
IndexBuilder::fillDatabase(indexTable, maskedLookup, unmaskedLookup, *kmerSubMat, &tseq, tdbr, dbFrom, dbFrom + dbSize, localKmerThr, maskMode, maskLowerCaseMode, maskProb);
IndexBuilder::fillDatabase(indexTable, maskedLookup, unmaskedLookup, *kmerSubMat,
_3merSubMatrix, _2merSubMatrix,
&tseq, tdbr, dbFrom, dbFrom + dbSize,
localKmerThr, maskMode, maskLowerCaseMode, maskProb, targetSearchMode);

// sequenceLookup has to be temporarily present to speed up masking
// afterwards its not needed anymore without diagonal scoring
Expand Down
1 change: 1 addition & 0 deletions src/prefiltering/Prefiltering.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class Prefiltering {
MultiParam<NuclAA<std::string>> scoringMatrixFile;
MultiParam<NuclAA<std::string>> seedScoringMatrixFile;
int targetSeqType;
int targetSearchMode;
bool takeOnlyBestKmer;
size_t maxResListLen;

Expand Down
55 changes: 32 additions & 23 deletions src/prefiltering/PrefilteringIndexReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
BaseMatrix *subMat, int maxSeqLen,
bool hasSpacedKmer, const std::string &spacedKmerPattern,
bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode,
int maskLowerCase, float maskProb, int kmerThr, int splits, int indexSubset) {
int maskLowerCase, float maskProb, int kmerThr, int targetSearchMode, int splits,
int indexSubset) {

const int SPLIT_META = splits > 1 ? 0 : 0;
const int SPLIT_SEQS = splits > 1 ? 1 : 0;
Expand All @@ -82,27 +83,6 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
writer.writeData(metadataptr, sizeof(metadata), META, SPLIT_META);
writer.alignToPageSize(SPLIT_META);

if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && indexSubset != Parameters::INDEX_SUBSET_NO_PREFILTER) {
int alphabetSize = subMat->alphabetSize;
subMat->alphabetSize = subMat->alphabetSize-1;
ScoreMatrix s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3);
ScoreMatrix s2 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 2);
subMat->alphabetSize = alphabetSize;

char* serialized3mer = ScoreMatrix::serialize(s3);
Debug(Debug::INFO) << "Write SCOREMATRIX3MER (" << SCOREMATRIX3MER << ")\n";
writer.writeData(serialized3mer, ScoreMatrix::size(s3), SCOREMATRIX3MER, SPLIT_META);
writer.alignToPageSize(SPLIT_META);
ExtendedSubstitutionMatrix::freeScoreMatrix(s3);
free(serialized3mer);

char* serialized2mer = ScoreMatrix::serialize(s2);
Debug(Debug::INFO) << "Write SCOREMATRIX2MER (" << SCOREMATRIX2MER << ")\n";
writer.writeData(serialized2mer, ScoreMatrix::size(s2), SCOREMATRIX2MER, SPLIT_META);
writer.alignToPageSize(SPLIT_META);
ExtendedSubstitutionMatrix::freeScoreMatrix(s2);
free(serialized2mer);
}

Debug(Debug::INFO) << "Write SCOREMATRIXNAME (" << SCOREMATRIXNAME << ")\n";
char* subData = BaseMatrix::serialize(subMat->matrixName, subMat->matrixData);
Expand Down Expand Up @@ -213,6 +193,29 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
if (indexSubset == Parameters::INDEX_SUBSET_NO_PREFILTER) {
splits = 0;
}

ScoreMatrix s3;
ScoreMatrix s2;
if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && indexSubset != Parameters::INDEX_SUBSET_NO_PREFILTER) {
int alphabetSize = subMat->alphabetSize;
subMat->alphabetSize = subMat->alphabetSize-1;
s3 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 3);
s2 = ExtendedSubstitutionMatrix::calcScoreMatrix(*subMat, 2);
subMat->alphabetSize = alphabetSize;

char* serialized3mer = ScoreMatrix::serialize(s3);
Debug(Debug::INFO) << "Write SCOREMATRIX3MER (" << SCOREMATRIX3MER << ")\n";
writer.writeData(serialized3mer, ScoreMatrix::size(s3), SCOREMATRIX3MER, SPLIT_META);
writer.alignToPageSize(SPLIT_META);
free(serialized3mer);

char* serialized2mer = ScoreMatrix::serialize(s2);
Debug(Debug::INFO) << "Write SCOREMATRIX2MER (" << SCOREMATRIX2MER << ")\n";
writer.writeData(serialized2mer, ScoreMatrix::size(s2), SCOREMATRIX2MER, SPLIT_META);
writer.alignToPageSize(SPLIT_META);
free(serialized2mer);
}

for (int s = 0; s < splits; s++) {
size_t dbFrom = 0;
size_t dbSize = 0;
Expand All @@ -226,7 +229,8 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
IndexBuilder::fillDatabase(&indexTable,
(maskMode == 1 || maskLowerCase == 1) ? &sequenceLookup : NULL,
(maskMode == 0 && maskLowerCase == 0) ? &sequenceLookup : NULL,
*subMat, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr, maskMode, maskLowerCase, maskProb);
*subMat, s3, s2, &seq, dbr1, dbFrom, dbFrom + dbSize, kmerThr,
maskMode, maskLowerCase, maskProb, targetSearchMode);
indexTable.printStatistics(subMat->num2aa);

if (sequenceLookup == NULL) {
Expand Down Expand Up @@ -282,6 +286,11 @@ void PrefilteringIndexReader::createIndexFile(const std::string &outDB,
writer.alignToPageSize(SPLIT_INDX + s);
}

if (Parameters::isEqualDbtype(seqType, Parameters::DBTYPE_HMM_PROFILE) == false && indexSubset != Parameters::INDEX_SUBSET_NO_PREFILTER) {
ExtendedSubstitutionMatrix::freeScoreMatrix(s3);
ExtendedSubstitutionMatrix::freeScoreMatrix(s2);
}

writer.close(false);
}

Expand Down
2 changes: 1 addition & 1 deletion src/prefiltering/PrefilteringIndexReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class PrefilteringIndexReader {
DBReader<unsigned int> *alndbr,
BaseMatrix *seedSubMat, int maxSeqLen, bool spacedKmer, const std::string &spacedKmerPattern,
bool compBiasCorrection, int alphabetSize, int kmerSize, int maskMode,
int maskLowerCase, float maskProb, int kmerThr, int splits, int indexSubset = 0);
int maskLowerCase, float maskProb, int kmerThr, int targetSearchMode, int splits, int indexSubset = 0);

static DBReader<unsigned int> *openNewHeaderReader(DBReader<unsigned int>*dbr, unsigned int dataIdx, unsigned int indexIdx, int threads, bool touchIndex, bool touchData);

Expand Down
Loading

0 comments on commit 71dd32e

Please sign in to comment.