From 9a0df0d25a92edd902a3590e10c25e0d51b9879f Mon Sep 17 00:00:00 2001 From: Martin Steinegger Date: Wed, 22 Sep 2021 15:16:48 +0900 Subject: [PATCH] Add pairaln --- src/CommandDeclarations.h | 1 + src/MMseqsBase.cpp | 11 +++- src/util/CMakeLists.txt | 1 + src/util/pairaln.cpp | 117 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 src/util/pairaln.cpp diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index f5bd305f1..1942617a6 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -46,6 +46,7 @@ extern int tsv2exprofiledb(int argc, const char **argv, const Command& command); extern int enrich(int argc, const char **argv, const Command& command); extern int expandaln(int argc, const char **argv, const Command& command); extern int expand2profile(int argc, const char **argv, const Command& command); +extern int pairaln(int argc, const char **argv, const Command& command); extern int countkmer(int argc, const char **argv, const Command& command); extern int extractalignedregion(int argc, const char **argv, const Command& command); extern int extractdomains(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index a5108ac1f..e5a6751fc 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -1134,8 +1134,15 @@ std::vector baseCommands = { {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb }, {"profileDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::profileDb }}}, - - + {"pairaln", pairaln, &par.threadsandcompression, COMMAND_EXPERT, + "Pair sequences to match best protein A and B from an species", + NULL, + "Martin Steinegger ", + " ", + CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb }, + {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::sequenceDb }, + {"alnDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }, + {"alnDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}}, {"diffseqdbs", diffseqdbs, &par.diff, COMMAND_SPECIAL, "Compute diff of two sequence DBs", // "It creates 3 filtering files, that can be used in conjunction with \"createsubdb\" tool.\nThe first file contains the keys that has been removed from DBold to DBnew.\nThe second file maps the keys of the kept sequences from DBold to DBnew.\nThe third file contains the keys of the sequences that have been added in DBnew.", diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index c0dab9a6c..9c1206097 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -37,6 +37,7 @@ set(util_source_files util/msa2result.cpp util/nrtotaxmapping.cpp util/countkmer.cpp + util/pairaln.cpp util/prefixid.cpp util/profile2pssm.cpp util/profile2seq.cpp diff --git a/src/util/pairaln.cpp b/src/util/pairaln.cpp new file mode 100644 index 000000000..8d42dde58 --- /dev/null +++ b/src/util/pairaln.cpp @@ -0,0 +1,117 @@ +#include "Util.h" +#include "Parameters.h" +#include "Matcher.h" +#include "Debug.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "IndexReader.h" +#include "MemoryMapped.h" +#include "NcbiTaxonomy.h" +#include "MappingReader.h" + +#define ZSTD_STATIC_LINKING_ONLY +#include + +#include + +#ifdef OPENMP +#include +#endif + +int pairaln(int argc, const char **argv, const Command& command) { + Parameters &par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + + const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; + const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); + + std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2); + MappingReader* mapping = new MappingReader(db2NoIndexName); + const int dbaccessMode = (DBReader::USE_INDEX | DBReader::USE_DATA); + IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + IndexReader *tDbr; + if (sameDB) { + tDbr = &qDbr; + } else { + tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); + } + + DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + alnDbr.open(DBReader::NOSORT); + if(alnDbr.getSize() % 2 != 0){ + Debug(Debug::ERROR) << "Alignment database does not seem to be paired\n"; + EXIT(EXIT_FAILURE); + } + size_t localThreads = 1; +#ifdef OPENMP + localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); +#endif + + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), localThreads, par.compressed, Parameters::DBTYPE_ALIGNMENT_RES); + resultWriter.open(); + + Debug::Progress progress(alnDbr.getSize()); +#pragma omp parallel num_threads(localThreads) + { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); +#endif + char buffer[1024 + 32768*4]; + std::string result; + result.reserve(1024 * 1024); + std::vector resultA; + resultA.reserve(100000); + std::vector resultB; + resultB.reserve(100000); + std::unordered_map findPair; + std::string outputA; + outputA.reserve(100000); + std::string outputB; + outputB.reserve(100000); +#pragma omp for schedule(static, 2) + for (size_t i = 0; i < alnDbr.getSize(); i+=2) { + progress.updateProgress(); + progress.updateProgress(); + resultA.clear(); + resultB.clear(); + outputA.clear(); + outputB.clear(); + Matcher::readAlignmentResults(resultA, alnDbr.getData(i, thread_idx), true); + Matcher::readAlignmentResults(resultB, alnDbr.getData(i+1, thread_idx), true); + + for (size_t resIdx = 0; resIdx < resultA.size(); ++resIdx) { + unsigned int taxon = mapping->lookup(resultA[resIdx].dbKey); + if(findPair.find(taxon) == findPair.end()){ + findPair.insert({taxon, &resultA[resIdx]}); + } + } + // find pairs + for (size_t resIdx = 0; resIdx < resultB.size(); ++resIdx) { + unsigned int taxon = mapping->lookup(resultB[resIdx].dbKey); + // found pair! + if(findPair.find(taxon) != findPair.end()) { + bool hasBacktrace = (resultB[resIdx].backtrace.size() > 0); + size_t len = Matcher::resultToBuffer(buffer, *findPair[taxon], hasBacktrace, false, false); + outputA.append(buffer, len); + len = Matcher::resultToBuffer(buffer, resultB[resIdx], hasBacktrace, false, false); + outputB.append(buffer, len); + findPair.erase (taxon); + } + } + resultWriter.writeData(outputA.c_str(), outputA.length(), alnDbr.getDbKey(i), thread_idx); + resultWriter.writeData(outputB.c_str(), outputB.length(), alnDbr.getDbKey(i+1), thread_idx); + } + } + + resultWriter.close(); + // clean up + delete mapping; + alnDbr.close(); + if (sameDB == false) { + delete tDbr; + } + + return 0; +} +