From 9a0df0d25a92edd902a3590e10c25e0d51b9879f Mon Sep 17 00:00:00 2001
From: Martin Steinegger <themartinsteinegger@gmail.com>
Date: Wed, 22 Sep 2021 15:16:48 +0900
Subject: [PATCH] Add pairaln

---
 src/CommandDeclarations.h |   1 +
 src/MMseqsBase.cpp        |  11 +++-
 src/util/CMakeLists.txt   |   1 +
 src/util/pairaln.cpp      | 117 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 src/util/pairaln.cpp
diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
index f5bd305f1..1942617a6 100644
--- a/src/CommandDeclarations.h
+++ b/src/CommandDeclarations.h
@@ -46,6 +46,7 @@ extern int tsv2exprofiledb(int argc, const char **argv, const Command& command);
 extern int enrich(int argc, const char **argv, const Command& command);
 extern int expandaln(int argc, const char **argv, const Command& command);
 extern int expand2profile(int argc, const char **argv, const Command& command);
+extern int pairaln(int argc, const char **argv, const Command& command);
 extern int countkmer(int argc, const char **argv, const Command& command);
 extern int extractalignedregion(int argc, const char **argv, const Command& command);
 extern int extractdomains(int argc, const char **argv, const Command& command);
diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
index a5108ac1f..e5a6751fc 100644
--- a/src/MMseqsBase.cpp
+++ b/src/MMseqsBase.cpp
@@ -1134,8 +1134,15 @@ std::vector<Command> baseCommands = {
                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                           {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                           {"profileDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::profileDb }}},
-
-
+        {"pairaln",      pairaln,        &par.threadsandcompression,       COMMAND_EXPERT,
+                "Pair sequences to match best protein A and B from an species",
+                NULL,
+                "Martin Steinegger <martin.steinegger@snu.ac.kr>",
+                "<i:queryDB> <i:targetDB> <i:alnDB> <o:alnDB|ca3mDB>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_TAXONOMY, &DbValidator::sequenceDb },
+                                          {"alnDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"alnDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
         {"diffseqdbs",           diffseqdbs,           &par.diff,                 COMMAND_SPECIAL,
                 "Compute diff of two sequence DBs",
 //                "It creates 3 filtering files, that can be used in conjunction with \"createsubdb\" tool.\nThe first file contains the keys that has been removed from DBold to DBnew.\nThe second file maps the keys of the kept sequences from DBold to DBnew.\nThe third file contains the keys of the sequences that have been added in DBnew.",
diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index c0dab9a6c..9c1206097 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -37,6 +37,7 @@ set(util_source_files
         util/msa2result.cpp
         util/nrtotaxmapping.cpp
         util/countkmer.cpp
+        util/pairaln.cpp
         util/prefixid.cpp
         util/profile2pssm.cpp
         util/profile2seq.cpp
diff --git a/src/util/pairaln.cpp b/src/util/pairaln.cpp
new file mode 100644
index 000000000..8d42dde58
--- /dev/null
+++ b/src/util/pairaln.cpp
@@ -0,0 +1,117 @@
+#include "Util.h"
+#include "Parameters.h"
+#include "Matcher.h"
+#include "Debug.h"
+#include "DBReader.h"
+#include "DBWriter.h"
+#include "IndexReader.h"
+#include "MemoryMapped.h"
+#include "NcbiTaxonomy.h"
+#include "MappingReader.h"
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include <zstd.h>
+
+#include <map>
+
+#ifdef OPENMP
+#include <omp.h>
+#endif
+
+int pairaln(int argc, const char **argv, const Command& command) {
+    Parameters &par = Parameters::getInstance();
+    par.parseParameters(argc, argv, command, true, 0, 0);
+
+    const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false;
+    const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
+
+    std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2);
+    MappingReader* mapping = new MappingReader(db2NoIndexName);
+    const int dbaccessMode = (DBReader<unsigned int>::USE_INDEX | DBReader<unsigned int>::USE_DATA);
+    IndexReader qDbr(par.db1, par.threads,  IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode);
+    IndexReader *tDbr;
+    if (sameDB) {
+        tDbr = &qDbr;
+    } else {
+        tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode);
+    }
+
+    DBReader<unsigned int> alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
+    alnDbr.open(DBReader<unsigned int>::NOSORT);
+    if(alnDbr.getSize() % 2 != 0){
+        Debug(Debug::ERROR) << "Alignment database does not seem to be paired\n";
+        EXIT(EXIT_FAILURE);
+    }
+    size_t localThreads = 1;
+#ifdef OPENMP
+    localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1);
+#endif
+
+    DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), localThreads,  par.compressed, Parameters::DBTYPE_ALIGNMENT_RES);
+    resultWriter.open();
+
+    Debug::Progress progress(alnDbr.getSize());
+#pragma omp parallel num_threads(localThreads)
+    {
+        unsigned int thread_idx = 0;
+#ifdef OPENMP
+        thread_idx = static_cast<unsigned int>(omp_get_thread_num());
+#endif
+        char buffer[1024 + 32768*4];
+        std::string result;
+        result.reserve(1024 * 1024);
+        std::vector<Matcher::result_t> resultA;
+        resultA.reserve(100000);
+        std::vector<Matcher::result_t> resultB;
+        resultB.reserve(100000);
+        std::unordered_map<unsigned int, Matcher::result_t *> findPair;
+        std::string outputA;
+        outputA.reserve(100000);
+        std::string outputB;
+        outputB.reserve(100000);
+#pragma omp  for schedule(static, 2)
+        for (size_t i = 0; i < alnDbr.getSize(); i+=2) {
+            progress.updateProgress();
+            progress.updateProgress();
+            resultA.clear();
+            resultB.clear();
+            outputA.clear();
+            outputB.clear();
+            Matcher::readAlignmentResults(resultA, alnDbr.getData(i, thread_idx), true);
+            Matcher::readAlignmentResults(resultB, alnDbr.getData(i+1, thread_idx), true);
+
+            for (size_t resIdx = 0; resIdx < resultA.size(); ++resIdx) {
+                unsigned int taxon = mapping->lookup(resultA[resIdx].dbKey);
+                if(findPair.find(taxon) == findPair.end()){
+                    findPair.insert({taxon, &resultA[resIdx]});
+                }
+            }
+            // find pairs
+            for (size_t resIdx = 0; resIdx < resultB.size(); ++resIdx) {
+                unsigned int taxon = mapping->lookup(resultB[resIdx].dbKey);
+                // found pair!
+                if(findPair.find(taxon) != findPair.end()) {
+                    bool hasBacktrace = (resultB[resIdx].backtrace.size() > 0);
+                    size_t len = Matcher::resultToBuffer(buffer, *findPair[taxon], hasBacktrace, false, false);
+                    outputA.append(buffer, len);
+                    len = Matcher::resultToBuffer(buffer, resultB[resIdx], hasBacktrace, false, false);
+                    outputB.append(buffer, len);
+                    findPair.erase (taxon);
+                }
+            }
+            resultWriter.writeData(outputA.c_str(), outputA.length(), alnDbr.getDbKey(i), thread_idx);
+            resultWriter.writeData(outputB.c_str(), outputB.length(), alnDbr.getDbKey(i+1), thread_idx);
+        }
+    }
+
+    resultWriter.close();
+    // clean up
+    delete mapping;
+    alnDbr.close();
+    if (sameDB == false) {
+        delete tDbr;
+    }
+
+    return 0;
+}
+