diff --git a/src/util/pairaln.cpp b/src/util/pairaln.cpp index 3e877875c..508f55366 100644 --- a/src/util/pairaln.cpp +++ b/src/util/pairaln.cpp @@ -9,110 +9,106 @@ #include "NcbiTaxonomy.h" #include "MappingReader.h" -#define ZSTD_STATIC_LINKING_ONLY -#include - #include #ifdef OPENMP #include #endif + +// need for sorting the results +static bool compareByTaxId(const Matcher::result_t &first, const Matcher::result_t &second) { + return (first.dbOrfStartPos < second.dbOrfStartPos); +} + int pairaln(int argc, const char **argv, const Command& command) { Parameters &par = Parameters::getInstance(); par.parseParameters(argc, argv, command, true, 0, 0); - const bool sameDB = par.db1.compare(par.db2) == 0 ? true : false; - const bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP); - std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex(par.db2); MappingReader* mapping = new MappingReader(db2NoIndexName); - const int dbaccessMode = (DBReader::USE_INDEX | DBReader::USE_DATA); - IndexReader qDbr(par.db1, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); - IndexReader *tDbr; - if (sameDB) { - tDbr = &qDbr; - } else { - tDbr = new IndexReader(par.db2, par.threads, IndexReader::SRC_SEQUENCES, (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0, dbaccessMode); - } DBReader alnDbr(par.db3.c_str(), par.db3Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); alnDbr.open(DBReader::NOSORT); - if(alnDbr.getSize() % 2 != 0){ - Debug(Debug::ERROR) << "Alignment database does not seem to be paired\n"; - EXIT(EXIT_FAILURE); - } - size_t localThreads = 1; -#ifdef OPENMP - localThreads = std::max(std::min((size_t)par.threads, alnDbr.getSize()), (size_t)1); -#endif - DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), localThreads, par.compressed, alnDbr.getDbtype()); + DBWriter resultWriter(par.db4.c_str(), par.db4Index.c_str(), 1, par.compressed, alnDbr.getDbtype()); resultWriter.open(); Debug::Progress progress(alnDbr.getSize()); -#pragma omp parallel num_threads(localThreads) - { - unsigned int thread_idx = 0; -#ifdef OPENMP - thread_idx = static_cast(omp_get_thread_num()); -#endif + do { + int thread_idx = 0; char buffer[1024 + 32768*4]; - std::string result; - result.reserve(1024 * 1024); - std::vector resultA; - resultA.reserve(100000); - std::vector resultB; - resultB.reserve(100000); - std::unordered_map findPair; - std::string outputA; - outputA.reserve(100000); - std::string outputB; - outputB.reserve(100000); -#pragma omp for schedule(static, 2) - for (size_t i = 0; i < alnDbr.getSize(); i+=2) { - progress.updateProgress(); - progress.updateProgress(); - resultA.clear(); - resultB.clear(); - outputA.clear(); - outputB.clear(); - Matcher::readAlignmentResults(resultA, alnDbr.getData(i, thread_idx), true); - Matcher::readAlignmentResults(resultB, alnDbr.getData(i+1, thread_idx), true); + std::vector result; + result.reserve(100000); + std::unordered_map findPair; + std::string output; + output.reserve(100000); - for (size_t resIdx = 0; resIdx < resultA.size(); ++resIdx) { - unsigned int taxon = mapping->lookup(resultA[resIdx].dbKey); - if(findPair.find(taxon) == findPair.end()){ - findPair.insert({taxon, &resultA[resIdx]}); + unsigned int referenceDbKey = 0; + size_t id = alnDbr.getId(referenceDbKey); + if (id == UINT_MAX) { + break; + } + char* data = alnDbr.getData(id, thread_idx); + Matcher::readAlignmentResults(result, data, true); + for (size_t resIdx = 0; resIdx < result.size(); ++resIdx) { + unsigned int taxon = mapping->lookup(result[resIdx].dbKey); + if (findPair.find(taxon) == findPair.end()) { + findPair.emplace(taxon, 1); + } + } + + // find intersection between all proteins + for (size_t i = 0; i < alnDbr.getSize(); i++) { + unsigned int dbKey = alnDbr.getDbKey(i); + if (dbKey == referenceDbKey) { + continue; + } + result.clear(); + Matcher::readAlignmentResults(result, alnDbr.getData(i, thread_idx), true); + // find pairs + for (size_t resIdx = 0; resIdx < result.size(); ++resIdx) { + unsigned int taxon = mapping->lookup(result[resIdx].dbKey); + if (findPair.find(taxon) != findPair.end()) { + findPair[taxon]++; } } + } + + for (size_t i = 0; i < alnDbr.getSize(); i++) { + progress.updateProgress(); + result.clear(); + output.clear(); + Matcher::readAlignmentResults(result, alnDbr.getData(i, thread_idx), true); // find pairs - for (size_t resIdx = 0; resIdx < resultB.size(); ++resIdx) { - unsigned int taxon = mapping->lookup(resultB[resIdx].dbKey); + for (size_t resIdx = 0; resIdx < result.size(); ++resIdx) { + unsigned int taxon = mapping->lookup(result[resIdx].dbKey); + // we don't want to introduce a new field, reuse existing unused field here + result[resIdx].dbOrfStartPos = taxon; + } + // stable sort is required to assure that best hit is first per taxon + std::stable_sort(result.begin(), result.end(), compareByTaxId); + unsigned int prevTaxon = UINT_MAX; + for (size_t resIdx = 0; resIdx < result.size(); ++resIdx) { + unsigned int taxon = result[resIdx].dbOrfStartPos; // found pair! - if(findPair.find(taxon) != findPair.end()) { - bool hasBacktrace = (resultB[resIdx].backtrace.size() > 0); - size_t len = Matcher::resultToBuffer(buffer, *findPair[taxon], hasBacktrace, false, false); - outputA.append(buffer, len); - len = Matcher::resultToBuffer(buffer, resultB[resIdx], hasBacktrace, false, false); - outputB.append(buffer, len); - findPair.erase (taxon); + if (taxon != prevTaxon + && findPair.find(taxon) != findPair.end() + && findPair[taxon] == alnDbr.getSize()) { + bool hasBacktrace = result[resIdx].backtrace.size() > 0; + size_t len = Matcher::resultToBuffer(buffer, result[resIdx], hasBacktrace, false, false); + output.append(buffer, len); } + prevTaxon = taxon; } - resultWriter.writeData(outputA.c_str(), outputA.length(), alnDbr.getDbKey(i), thread_idx); - resultWriter.writeData(outputB.c_str(), outputB.length(), alnDbr.getDbKey(i+1), thread_idx); - findPair.clear(); + resultWriter.writeData(output.c_str(), output.length(), alnDbr.getDbKey(i), thread_idx); } - } + } while(false); resultWriter.close(); // clean up delete mapping; alnDbr.close(); - if (sameDB == false) { - delete tDbr; - } - - return 0; + return EXIT_SUCCESS; }