From 5b4c8163d057b93ba1063e5352b56e23835986bc Mon Sep 17 00:00:00 2001 From: "Luna Jang (Sung-eun Jang)" Date: Tue, 21 May 2024 16:00:57 +0900 Subject: [PATCH] Recover longest orf after fragment elimination (#832) * recover longest orf after fragment elimination * delete unused variable --- data/workflow/taxpercontig.sh | 6 +- src/CommandDeclarations.h | 1 + src/MMseqsBase.cpp | 8 ++ src/util/CMakeLists.txt | 1 + src/util/recoverlongestorf.cpp | 148 +++++++++++++++++++++++++++++++++ src/workflow/Taxonomy.cpp | 1 + 6 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 src/util/recoverlongestorf.cpp diff --git a/data/workflow/taxpercontig.sh b/data/workflow/taxpercontig.sh index 299ac3646..f350df220 100644 --- a/data/workflow/taxpercontig.sh +++ b/data/workflow/taxpercontig.sh @@ -45,7 +45,11 @@ if [ -n "${ORF_FILTER}" ]; then fi if notExists "${TMP_PATH}/orfs_aln.list"; then - awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln.list" + # shellcheck disable=SC2086 + "$MMSEQS" recoverlongestorf "${ORFS_DB}" "${TMP_PATH}/orfs_aln" "${TMP_PATH}/orfs_aln_recovered.list" ${THREADS_PAR} \ + || fail "recoverlongestorf died" + awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln_remain.list" + cat "${TMP_PATH}/orfs_aln_recovered.list" "${TMP_PATH}/orfs_aln_remain.list" > "${TMP_PATH}/orfs_aln.list" fi if notExists "${TMP_PATH}/orfs_filter.dbtype"; then diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index e8eac296c..3c4abb4e0 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -98,6 +98,7 @@ extern int ungappedprefilter(int argc, const char **argv, const Command& command extern int gappedprefilter(int argc, const char **argv, const Command& command); extern int unpackdb(int argc, const char **argv, const Command& command); extern int rbh(int argc, const char **argv, const Command& command); +extern int recoverlongestorf(int argc, const char **argv, const Command& command); extern int result2flat(int argc, const char **argv, const Command& command); extern int result2msa(int argc, const char **argv, const Command& command); extern int result2dnamsa(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index 282e740d1..720aeaccb 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -907,6 +907,14 @@ std::vector baseCommands = { "Eli Levy Karin ", " ", CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}}, + {"recoverlongestorf", recoverlongestorf, &par.onlythreads, COMMAND_EXPERT, + "Recover longest ORF for taxonomy annotation after elimination", + NULL, + "Sung-eun Jang", + " ", + CITATION_MMSEQS2, {{"orfDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb}, + {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb}, + {"tsvFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}}, {"reverseseq", reverseseq, &par.reverseseq, COMMAND_SEQUENCE, "Reverse (without complement) sequences", NULL, diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 7fa7b0603..c43c3356d 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -47,6 +47,7 @@ set(util_source_files util/profile2pssm.cpp util/profile2neff.cpp util/profile2seq.cpp + util/recoverlongestorf.cpp util/result2dnamsa.cpp util/result2flat.cpp util/result2msa.cpp diff --git a/src/util/recoverlongestorf.cpp b/src/util/recoverlongestorf.cpp new file mode 100644 index 000000000..b25d89476 --- /dev/null +++ b/src/util/recoverlongestorf.cpp @@ -0,0 +1,148 @@ +#include "Parameters.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "Debug.h" +#include "Util.h" +#include "Orf.h" + +#include +#include + +#ifdef OPENMP +#include +#endif + +int recoverlongestorf(int argc, const char **argv, const Command &command) { + Parameters &par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, 0); + + DBReader headerReader(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + headerReader.open(DBReader::LINEAR_ACCCESS); + + DBReader resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader::USE_INDEX|DBReader::USE_DATA); + resultReader.open(DBReader::LINEAR_ACCCESS); + + DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, false, Parameters::DBTYPE_OMIT_FILE); + writer.open(); + + Debug::Progress progress(resultReader.getSize()); + std::unordered_map> contigToLongest; +#pragma omp parallel + { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); + +#endif + std::unordered_map> localContigToLongest; + +#pragma omp for schedule(dynamic, 100) + for (size_t id = 0; id < headerReader.getSize(); ++id) { + progress.updateProgress(); + unsigned int orfKey = headerReader.getDbKey(id); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + size_t orfLen = std::max(orf.from, orf.to) - std::min(orf.from, orf.to) + 1; + std::unordered_map>::iterator it = localContigToLongest.find(contigKey); + if (it != localContigToLongest.end()) { + std::pair orfKeyToLength = it->second; + if (orfLen > orfKeyToLength.second) { + it->second = std::make_pair(orfKey, orfLen); + } + } else { + localContigToLongest.emplace(contigKey, std::make_pair(orfKey, orfLen)); + } + } + +#pragma omp critical + { + for (auto &entry : localContigToLongest) { + auto &contigKey = entry.first; + auto &localData = entry.second; + auto it = contigToLongest.find(contigKey); + if (it != contigToLongest.end()) { + if (localData.second > it->second.second) { + it->second = localData; + } + } else { + contigToLongest[contigKey] = localData; + } + } + } + } + + progress.reset(resultReader.getSize()); + std::unordered_set acceptedContigs; + std::unordered_set eliminatedContigs; +#pragma omp parallel + { + int thread_idx = 0; +#ifdef OPENMP + thread_idx = omp_get_thread_num(); +#endif + std::string resultBuffer; + resultBuffer.reserve(1024 * 1024); + + std::unordered_set localAcceptedContigs; + std::unordered_set localEliminatedContigs; +#pragma omp for schedule(dynamic, 5) + for (size_t i = 0; i < resultReader.getSize(); ++i) { + progress.updateProgress(); + + unsigned int key = resultReader.getDbKey(i); + size_t entryLength = resultReader.getEntryLen(i); + if (entryLength > 1) { + size_t id = headerReader.getId(key); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + localAcceptedContigs.emplace(contigKey); + } + + size_t id = headerReader.getId(key); + char *orfHeader = headerReader.getData(id, thread_idx); + Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader); + unsigned int contigKey = orf.id; + localEliminatedContigs.emplace(contigKey); + } + +#pragma omp critical + { + acceptedContigs.insert(localAcceptedContigs.begin(), localAcceptedContigs.end()); + eliminatedContigs.insert(localEliminatedContigs.begin(), localEliminatedContigs.end()); + } + } + + for (auto it = eliminatedContigs.begin(); it != eliminatedContigs.end(); ) { + if (acceptedContigs.find(*it) != acceptedContigs.end()) { + it = eliminatedContigs.erase(it); + } else { + ++it; + } + } + + std::string resultBuffer; + resultBuffer.reserve(1024 * 1024); + for (auto contigIt = eliminatedContigs.begin(); contigIt != eliminatedContigs.end(); ++contigIt) { + unsigned int contigKey = *contigIt; + std::unordered_map>::iterator it = contigToLongest.find(contigKey); + if (it != contigToLongest.end()) { + unsigned int orfKey = it->second.first; + resultBuffer.append(SSTR(orfKey)); + resultBuffer.append(1, '\n'); + writer.writeData(resultBuffer.c_str(), resultBuffer.length(), orfKey, 0, false, false); + resultBuffer.clear(); + } else { + Debug(Debug::ERROR) << "Missing contig " << contigKey << "\n"; + EXIT(EXIT_FAILURE); + } + } + + writer.close(true); + FileUtil::remove(writer.getIndexFileName()); + headerReader.close(); + resultReader.close(); + + return EXIT_SUCCESS; +} diff --git a/src/workflow/Taxonomy.cpp b/src/workflow/Taxonomy.cpp index 4dbd34c73..78bb7606c 100644 --- a/src/workflow/Taxonomy.cpp +++ b/src/workflow/Taxonomy.cpp @@ -95,6 +95,7 @@ int taxonomy(int argc, const char **argv, const Command& command) { cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); cmd.addVariable("RUNNER", par.runner.c_str()); cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str()); + cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str()); cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str()); if (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED && !(searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) {