Changes needed for cluster search

soedinglab · Aug 7, 2023 · ad6dfc6 · ad6dfc6
1 parent eb01b5b
commit ad6dfc6
Show file tree

Hide file tree

Showing 9 changed files with 129 additions and 21 deletions.
diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
@@ -108,6 +108,7 @@ extern int search(int argc, const char **argv, const Command& command);
 extern int linsearch(int argc, const char **argv, const Command& command);
 extern int sortresult(int argc, const char **argv, const Command& command);
 extern int splitdb(int argc, const char **argv, const Command& command);
+extern int setextendeddbtype(int argc, const char **argv, const Command& command);
 extern int splitsequence(int argc, const char **argv, const Command& command);
 extern int subtractdbs(int argc, const char **argv, const Command& command);
 extern int suffixid(int argc, const char **argv, const Command& command);

diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
@@ -793,8 +793,13 @@ std::vector<Command> baseCommands = {
                 CITATION_MMSEQS2, {{"resultDBLeft", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                           {"resultDBRight", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                           {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
-
-
+        {"setextendeddbtype",                 setextendeddbtype,                 &par.extendeddbtype,                 COMMAND_DB,
+                "Write an extended DB ",
+                "# Print entries with keys 1, 2 and 3 from a sequence DB to stdout\n"
+                "mmseqs setextendedbtype db --extended-dbtype 2\n",
+                "Martin Steinegger <martin.steinegger@snu.ac.kr>",
+                "<i:DB>",
+                CITATION_MMSEQS2, {{"DB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::allDb }}},
 
         {"view",                 view,                 &par.view,                 COMMAND_DB,
                 "Print DB entries given in --id-list to stdout",

diff --git a/src/commons/IndexReader.h b/src/commons/IndexReader.h
@@ -74,8 +74,28 @@ class IndexReader {
         }
 
         if (sequenceReader == NULL) {
-            if ((databaseType & USER_SELECT) == false && databaseType & (HEADERS | SRC_HEADERS)) {
-                failSuffix = "_h";
+            if((databaseType & USER_SELECT) == false && failSuffix == "" ){
+                if (databaseType & HEADERS) {
+                    failSuffix = "_h";
+                } else if (databaseType & SRC_HEADERS) {
+                    failSuffix = "_seq_h";
+                    if(FileUtil::fileExists((dataName + "_seq_h.dbtype").c_str())==false){
+                        failSuffix = "_h";
+                    }
+                } else if (databaseType & SRC_SEQUENCES) {
+                    failSuffix = "_seq";
+                    if(FileUtil::fileExists((dataName + "_seq.dbtype").c_str())==false){
+                        failSuffix = "";
+                    }
+                } else if (databaseType & SEQUENCES) {
+                    failSuffix = "";
+                } else if (databaseType & ALIGNMENTS) {
+                    if(FileUtil::fileExists((dataName + "_clu.dbtype").c_str())){
+                        failSuffix = "_clu";
+                    }else{
+                        failSuffix = "_aln";
+                    }
+                }
             }
             sequenceReader = new DBReader<unsigned int>(
                     (dataName + failSuffix).c_str(), (dataName + failSuffix + ".index").c_str(),

diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
@@ -188,7 +188,8 @@ Parameters::Parameters():
         // indexdb
         PARAM_CHECK_COMPATIBLE(PARAM_CHECK_COMPATIBLE_ID, "--check-compatible", "Check compatible", "0: Always recreate index, 1: Check if recreating index is needed, 2: Fail if index is incompatible", typeid(int), (void *) &checkCompatible, "^[0-2]{1}$", MMseqsParameter::COMMAND_MISC),
         PARAM_SEARCH_TYPE(PARAM_SEARCH_TYPE_ID, "--search-type", "Search type", "Search type 0: auto 1: amino acid, 2: translated, 3: nucleotide, 4: translated nucleotide alignment", typeid(int), (void *) &searchType, "^[0-4]{1}"),
-        PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries\n0: normal index\n1: index without headers\n2: index without prefiltering data\nFlags can be combined bit wise", typeid(int), (void *) &indexSubset, "^[0-3]{1}", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_INDEX_SUBSET(PARAM_INDEX_SUBSET_ID, "--index-subset", "Index subset", "Create specialized index with subset of entries\n0: normal index\n1: index without headers\n2: index without prefiltering data\n4: index without aln (for cluster db)\nFlags can be combined bit wise", typeid(int), (void *) &indexSubset, "^[0-7]{1}", MMseqsParameter::COMMAND_EXPERT),
+        PARAM_INDEX_DBSUFFIX(PARAM_INDEX_DBSUFFIX_ID, "--index-dbsuffix", "Index dbsuffix", "A suffix of the db (used for cluster dbs)", typeid(std::string), (void *) &indexDbsuffix, "", MMseqsParameter::COMMAND_HIDDEN),
         // createdb
         PARAM_USE_HEADER(PARAM_USE_HEADER_ID, "--use-fasta-header", "Use fasta header", "Use the id parsed from the fasta header as the index key instead of using incrementing numeric identifiers", typeid(bool), (void *) &useHeader, ""),
         PARAM_ID_OFFSET(PARAM_ID_OFFSET_ID, "--id-offset", "Offset of numeric ids", "Numeric ids in index file are offset by this value", typeid(int), (void *) &identifierOffset, "^(0|[1-9]{1}[0-9]*)$"),
@@ -197,6 +198,8 @@ Parameters::Parameters():
         PARAM_SHUFFLE(PARAM_SHUFFLE_ID, "--shuffle", "Shuffle input database", "Shuffle input database", typeid(bool), (void *) &shuffleDatabase, ""),
         PARAM_WRITE_LOOKUP(PARAM_WRITE_LOOKUP_ID, "--write-lookup", "Write lookup file", "write .lookup file containing mapping from internal id, fasta id and file number", typeid(int), (void *) &writeLookup, "^[0-1]{1}", MMseqsParameter::COMMAND_EXPERT),
         PARAM_USE_HEADER_FILE(PARAM_USE_HEADER_FILE_ID, "--use-header-file", "Use header DB", "use the sequence header DB instead of the body to map the entry keys", typeid(bool), (void *) &useHeaderFile, ""),
+        // setextendeddbtype
+        PARAM_EXTENDED_DBTYPE(PARAM_EXTENDED_DBTYPE_ID, "--extended-dbtype", "Extended dbtype", "Set extended dbtype 1: compressed, 2: need src, 4: context pseudoe cnts", typeid(int), (void *) &extendedDbtype, "^[0-4]{1}"),
         // splitsequence
         PARAM_SEQUENCE_OVERLAP(PARAM_SEQUENCE_OVERLAP_ID, "--sequence-overlap", "Overlap between sequences", "Overlap between sequences", typeid(int), (void *) &sequenceOverlap, "^(0|[1-9]{1}[0-9]*)$"),
         PARAM_SEQUENCE_SPLIT_MODE(PARAM_SEQUENCE_SPLIT_MODE_ID, "--sequence-split-mode", "Sequence split mode", "Sequence split mode 0: copy data, 1: soft link data and write new index,", typeid(int), (void *) &sequenceSplitMode, "^[0-1]{1}$"),
@@ -752,6 +755,7 @@ Parameters::Parameters():
     indexdb.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
     indexdb.push_back(&PARAM_MAX_SEQ_LEN);
     indexdb.push_back(&PARAM_MAX_SEQS);
+    indexdb.push_back(&PARAM_INDEX_DBSUFFIX);
     indexdb.push_back(&PARAM_MASK_RESIDUES);
     indexdb.push_back(&PARAM_MASK_PROBABILTY);
     indexdb.push_back(&PARAM_MASK_LOWER_CASE);
@@ -918,6 +922,10 @@ Parameters::Parameters():
     subtractdbs.push_back(&PARAM_COMPRESSED);
     subtractdbs.push_back(&PARAM_V);
 
+    // setextendeddbtype
+    extendeddbtype.push_back(&PARAM_EXTENDED_DBTYPE);
+    extendeddbtype.push_back(&PARAM_V);
+
     // clusthash
     clusthash.push_back(&PARAM_SUB_MAT);
     clusthash.push_back(&PARAM_ALPH_SIZE);
@@ -2324,6 +2332,7 @@ void Parameters::setDefaults() {
     checkCompatible = 0;
     searchType = SEARCH_TYPE_AUTO;
     indexSubset = INDEX_SUBSET_NORMAL;
+    indexDbsuffix = "";
 
     // createdb
     createdbMode = SEQUENCE_SPLIT_MODE_HARD;

diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
@@ -186,6 +186,8 @@ class Parameters {
     static const int INDEX_SUBSET_NORMAL = 0;
     static const int INDEX_SUBSET_NO_HEADERS = 1;
     static const int INDEX_SUBSET_NO_PREFILTER = 2;
+    static const int INDEX_SUBSET_NO_ALIGNMENT = 4;
+
 
     static std::vector<int> getOutputFormat(int formatMode, const std::string &outformat, bool &needSequences, bool &needBacktrace, bool &needFullHeaders,
                                             bool &needLookup, bool &needSource, bool &needTaxonomyMapping, bool &needTaxonomy);
@@ -546,6 +548,7 @@ class Parameters {
     int checkCompatible;
     int searchType;
     int indexSubset;
+    std::string indexDbsuffix;
 
     // createdb
     int identifierOffset;
@@ -610,6 +613,9 @@ class Parameters {
     float overlap;
     int msaType;
 
+    // setextendeddbtype
+    int extendedDbtype;
+
     // extractalignedregion
     int extractMode;
 
@@ -892,6 +898,7 @@ class Parameters {
     PARAMETER(PARAM_CHECK_COMPATIBLE)
     PARAMETER(PARAM_SEARCH_TYPE)
     PARAMETER(PARAM_INDEX_SUBSET)
+    PARAMETER(PARAM_INDEX_DBSUFFIX)
 
     // createdb
     PARAMETER(PARAM_USE_HEADER) // also used by extractorfs
@@ -904,6 +911,9 @@ class Parameters {
     // convert2fasta
     PARAMETER(PARAM_USE_HEADER_FILE)
 
+    // setextendedbtype
+    PARAMETER(PARAM_EXTENDED_DBTYPE)
+
     // split sequence
     PARAMETER(PARAM_SEQUENCE_OVERLAP)
     PARAMETER(PARAM_SEQUENCE_SPLIT_MODE)
@@ -1116,6 +1126,7 @@ class Parameters {
     std::vector<MMseqsParameter*> offsetalignment;
     std::vector<MMseqsParameter*> proteinaln2nucl;
     std::vector<MMseqsParameter*> subtractdbs;
+    std::vector<MMseqsParameter*> extendeddbtype;
     std::vector<MMseqsParameter*> diff;
     std::vector<MMseqsParameter*> concatdbs;
     std::vector<MMseqsParameter*> mergedbs;

diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
@@ -57,8 +57,10 @@ set(util_source_files
         util/cpmvrmlndb.cpp
         util/extractframes.cpp
         util/sequence2profile.cpp
+        util/setextendeddbtype.cpp
         util/sortresult.cpp
         util/splitdb.cpp
+        util/setextendeddbtype.cpp
         util/splitsequence.cpp
         util/subtractdbs.cpp
         util/summarizealis.cpp

diff --git a/src/util/indexdb.cpp b/src/util/indexdb.cpp
@@ -51,18 +51,39 @@ int indexdb(int argc, const char **argv, const Command &command) {
 
     const bool sameDB = (par.db1 == par.db2);
     std::string alnDbtypeFile = par.db1 + "_aln.dbtype";
-    std::string seqDbtypeFile = par.db1 + "_seq.dbtype";
-    const bool ppDB = FileUtil::fileExists(alnDbtypeFile.c_str()) && FileUtil::fileExists(seqDbtypeFile.c_str());
-
-    std::string db2 = ppDB ? par.db1 + "_seq" : par.db2;
-    std::string db2Index = ppDB ? par.db1 + "_seq.index" : par.db2Index;
+    std::string alnFile = par.db1 + "_aln";
+    std::string alnIndexFile = par.db1 + "_aln.index";
+    if(FileUtil::fileExists(alnDbtypeFile.c_str()) == false){
+        alnDbtypeFile =  par.db1 + "_clu.dbtype";
+        alnFile = par.db1 + "_clu";
+        alnIndexFile = par.db1 + "_clu.index";
+    }
 
-    std::string hdr1 = ppDB ? par.db1 + "_seq_h" : par.hdr1;
-    std::string hdr1Index = ppDB ? par.db1 + "_seq_h.index" : par.hdr1Index;
 
     DBReader<unsigned int> dbr(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
     dbr.open(DBReader<unsigned int>::NOSORT);
 
+    // remove par.indexDbsuffix from db1
+    std::string seqDb = par.db1 + "_seq";
+    std::string seqDbIndex = par.db1 + "_seq.index";
+    std::string seqDbtypeFile = par.db1 + "_seq.dbtype";
+    if (par.indexDbsuffix != "") {
+        std::string::size_type pos = par.db1.find(par.indexDbsuffix);
+        if (pos != std::string::npos) {
+            par.db1 = par.db1.substr(0, pos);
+        }
+        seqDb         = par.db1 + "_seq" + par.indexDbsuffix;
+        seqDbIndex    = par.db1 + "_seq" + par.indexDbsuffix + ".index";
+        seqDbtypeFile = par.db1 + "_seq" + par.indexDbsuffix + ".dbtype";
+    }
+    const bool ppDB = FileUtil::fileExists(alnDbtypeFile.c_str()) && FileUtil::fileExists(seqDbtypeFile.c_str());
+
+    std::string db2 = ppDB ? seqDb : par.db2;
+    std::string db2Index = ppDB ? seqDbIndex : par.db2Index;
+
+    std::string hdr1 = ppDB ? seqDb + "_h" : par.hdr1;
+    std::string hdr1Index = ppDB ?  seqDb + "_h.index" : par.hdr1Index;
+
     DBReader<unsigned int> *dbr2 = NULL;
     if ((sameDB == false) || ppDB) {
         dbr2 = new DBReader<unsigned int>(db2.c_str(), db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
@@ -102,7 +123,7 @@ int indexdb(int argc, const char **argv, const Command &command) {
     // query seq type is actually unknown here, but if we pass DBTYPE_HMM_PROFILE then its +20 k-score
     int kmerScore = Prefiltering::getKmerThreshold(par.sensitivity, isProfileSearch, contextPseudoCnts, par.kmerScore.values, par.kmerSize);
 
-    const std::string& baseDB = ppDB ? par.db1 : par.db2;
+    const std::string& baseDB = ppDB ? par.db1 + par.indexDbsuffix : par.db2;
     std::string indexDB = PrefilteringIndexReader::indexName(baseDB);
 
     int status = EXIT_SUCCESS;
@@ -150,8 +171,10 @@ int indexdb(int argc, const char **argv, const Command &command) {
         }
 
         DBReader<unsigned int> *alndbr = NULL;
-        if (ppDB == true) {
-            alndbr = new DBReader<unsigned int>((par.db1 + "_aln").c_str(), (par.db1 + "_aln.index").c_str(), par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+        const bool noAlignment = (par.indexSubset & Parameters::INDEX_SUBSET_NO_ALIGNMENT) != 0;
+        if (ppDB == true && noAlignment == false) {
+            alndbr = new DBReader<unsigned int>(alnFile.c_str(), alnIndexFile.c_str(),
+                                                par.threads, DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
             alndbr->open(DBReader<unsigned int>::NOSORT);
         }
 

diff --git a/src/util/mergeresultsbyset.cpp b/src/util/mergeresultsbyset.cpp
@@ -2,6 +2,7 @@
 #include "DBReader.h"
 #include "DBWriter.h"
 #include "Util.h"
+#include "IndexReader.h"
 
 #ifdef OPENMP
 #include <omp.h>
@@ -14,10 +15,26 @@ int mergeresultsbyset(int argc, const char **argv, const Command &command) {
     DBReader<unsigned int> setReader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
     setReader.open(DBReader<unsigned int>::LINEAR_ACCCESS);
 
-    DBReader<unsigned int> resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
-    resultReader.open(DBReader<unsigned int>::NOSORT);
+//    DBReader<unsigned int> resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+//    resultReader.open(DBReader<unsigned int>::NOSORT);
 
-    DBWriter dbw(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, resultReader.getDbtype());
+
+    const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
+    unsigned int databaseType = IndexReader::USER_SELECT;
+    int dbtype = FileUtil::parseDbType(par.db2.c_str());
+    if(Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_INDEX_DB)||
+       Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_AMINO_ACIDS)||
+       Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_NUCLEOTIDES)||
+       Parameters::isEqualDbtype(dbtype, Parameters::DBTYPE_HMM_PROFILE)){
+        databaseType = IndexReader::ALIGNMENTS;
+    }
+    IndexReader resultReader(par.db2, par.threads,
+                              databaseType,
+                             (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0);
+
+    int dbType = resultReader.sequenceReader->getDbtype();
+    dbType = DBReader<unsigned int>::setExtendedDbtype(dbType, Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC);
+    DBWriter dbw(par.db3.c_str(), par.db3Index.c_str(), par.threads, par.compressed, dbType);
     dbw.open();
 #pragma omp parallel
     {
@@ -35,20 +52,19 @@ int mergeresultsbyset(int argc, const char **argv, const Command &command) {
             while (*data != '\0'){
                 Util::parseKey(data, dbKey);
                 unsigned int key = Util::fast_atoi<unsigned int>(dbKey);
-                size_t id = resultReader.getId(key);
+                size_t id = resultReader.sequenceReader->getId(key);
                 if (id == UINT_MAX) {
                     Debug(Debug::ERROR) << "Invalid key " << key << " in entry " << i << ".\n";
                     EXIT(EXIT_FAILURE);
                 }
-                buffer.append(resultReader.getData(id, thread_idx));
+                buffer.append(resultReader.sequenceReader->getData(id, thread_idx));
                 data = Util::skipLine(data);
             }
             dbw.writeData(buffer.c_str(), buffer.length(), setReader.getDbKey(i), thread_idx);
             buffer.clear();
         }
     }
     dbw.close();
-    resultReader.close();
     setReader.close();
 
     return EXIT_SUCCESS;

diff --git a/src/util/setextendeddbtype.cpp b/src/util/setextendeddbtype.cpp
@@ -0,0 +1,21 @@
+#include "Parameters.h"
+#include "Util.h"
+#include "Debug.h"
+#include "DBReader.h"
+#include "DBWriter.h"
+#include "HeaderSummarizer.h"
+
+#ifdef OPENMP
+#include <omp.h>
+#endif
+
+int setextendeddbtype(int argc, const char **argv, const Command& command) {
+    Parameters& par = Parameters::getInstance();
+    par.parseParameters(argc, argv, command, true, 0, 0);
+    int dbtype = FileUtil::parseDbType(par.db1.c_str());
+    // check if dbtype uses isCompressed flag
+    bool isCompressed = (dbtype & (1 << 31));
+    dbtype = DBReader<unsigned int>::setExtendedDbtype(dbtype,  par.extendedDbtype);
+    DBWriter::writeDbtypeFile(par.db1.c_str(), dbtype, isCompressed);
+    return EXIT_SUCCESS;
+}