Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

recover longest orf after fragment elimination #832

Merged
merged 2 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion data/workflow/taxpercontig.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,11 @@ if [ -n "${ORF_FILTER}" ]; then
fi

if notExists "${TMP_PATH}/orfs_aln.list"; then
awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln.list"
# shellcheck disable=SC2086
"$MMSEQS" recoverlongestorf "${ORFS_DB}" "${TMP_PATH}/orfs_aln" "${TMP_PATH}/orfs_aln_recovered.list" ${THREADS_PAR} \
|| fail "recoverlongestorf died"
awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln_remain.list"
cat "${TMP_PATH}/orfs_aln_recovered.list" "${TMP_PATH}/orfs_aln_remain.list" > "${TMP_PATH}/orfs_aln.list"
fi

if notExists "${TMP_PATH}/orfs_filter.dbtype"; then
Expand Down
1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ extern int ungappedprefilter(int argc, const char **argv, const Command& command
extern int gappedprefilter(int argc, const char **argv, const Command& command);
extern int unpackdb(int argc, const char **argv, const Command& command);
extern int rbh(int argc, const char **argv, const Command& command);
extern int recoverlongestorf(int argc, const char **argv, const Command& command);
extern int result2flat(int argc, const char **argv, const Command& command);
extern int result2msa(int argc, const char **argv, const Command& command);
extern int result2dnamsa(int argc, const char **argv, const Command& command);
Expand Down
8 changes: 8 additions & 0 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,14 @@ std::vector<Command> baseCommands = {
"Eli Levy Karin <eli.levy.karin@gmail.com> ",
"<i:contigsSequenceDB> <i:extractedOrfsHeadersDB> <o:orfsAlignedToContigDB>",
CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
{"recoverlongestorf", recoverlongestorf, &par.onlythreads, COMMAND_EXPERT,
"Recover longest ORF for taxonomy annotation after elimination",
NULL,
"Sung-eun Jang",
"<i:orfDB> <i:resultDB> <o:tsvFile>",
CITATION_MMSEQS2, {{"orfDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb},
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb},
{"tsvFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
{"reverseseq", reverseseq, &par.reverseseq, COMMAND_SEQUENCE,
"Reverse (without complement) sequences",
NULL,
Expand Down
1 change: 1 addition & 0 deletions src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ set(util_source_files
util/profile2pssm.cpp
util/profile2neff.cpp
util/profile2seq.cpp
util/recoverlongestorf.cpp
util/result2dnamsa.cpp
util/result2flat.cpp
util/result2msa.cpp
Expand Down
148 changes: 148 additions & 0 deletions src/util/recoverlongestorf.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#include "Parameters.h"
#include "DBReader.h"
#include "DBWriter.h"
#include "Debug.h"
#include "Util.h"
#include "Orf.h"

#include <unordered_map>
#include <unordered_set>

#ifdef OPENMP
#include <omp.h>
#endif

int recoverlongestorf(int argc, const char **argv, const Command &command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

DBReader<unsigned int> headerReader(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
headerReader.open(DBReader<unsigned int>::LINEAR_ACCCESS);

DBReader<unsigned int> resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
resultReader.open(DBReader<unsigned int>::LINEAR_ACCCESS);

DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, false, Parameters::DBTYPE_OMIT_FILE);
writer.open();

Debug::Progress progress(resultReader.getSize());
std::unordered_map<unsigned int, std::pair<unsigned int, size_t>> contigToLongest;
#pragma omp parallel
{
unsigned int thread_idx = 0;
#ifdef OPENMP
thread_idx = static_cast<unsigned int>(omp_get_thread_num());

#endif
std::unordered_map<unsigned int, std::pair<unsigned int, size_t>> localContigToLongest;

#pragma omp for schedule(dynamic, 100)
for (size_t id = 0; id < headerReader.getSize(); ++id) {
progress.updateProgress();
unsigned int orfKey = headerReader.getDbKey(id);
char *orfHeader = headerReader.getData(id, thread_idx);
Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader);
unsigned int contigKey = orf.id;
size_t orfLen = std::max(orf.from, orf.to) - std::min(orf.from, orf.to) + 1;
std::unordered_map<unsigned int, std::pair<unsigned int, size_t>>::iterator it = localContigToLongest.find(contigKey);
if (it != localContigToLongest.end()) {
std::pair<unsigned int, size_t> orfKeyToLength = it->second;
if (orfLen > orfKeyToLength.second) {
it->second = std::make_pair(orfKey, orfLen);
}
} else {
localContigToLongest.emplace(contigKey, std::make_pair(orfKey, orfLen));
}
}

#pragma omp critical
{
for (auto &entry : localContigToLongest) {
auto &contigKey = entry.first;
auto &localData = entry.second;
auto it = contigToLongest.find(contigKey);
if (it != contigToLongest.end()) {
if (localData.second > it->second.second) {
it->second = localData;
}
} else {
contigToLongest[contigKey] = localData;
}
}
}
}

progress.reset(resultReader.getSize());
std::unordered_set<unsigned int> acceptedContigs;
std::unordered_set<unsigned int> eliminatedContigs;
#pragma omp parallel
{
int thread_idx = 0;
#ifdef OPENMP
thread_idx = omp_get_thread_num();
#endif
std::string resultBuffer;
resultBuffer.reserve(1024 * 1024);

std::unordered_set<unsigned int> localAcceptedContigs;
std::unordered_set<unsigned int> localEliminatedContigs;
#pragma omp for schedule(dynamic, 5)
for (size_t i = 0; i < resultReader.getSize(); ++i) {
progress.updateProgress();

unsigned int key = resultReader.getDbKey(i);
size_t entryLength = resultReader.getEntryLen(i);
if (entryLength > 1) {
size_t id = headerReader.getId(key);
char *orfHeader = headerReader.getData(id, thread_idx);
Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader);
unsigned int contigKey = orf.id;
localAcceptedContigs.emplace(contigKey);
}

size_t id = headerReader.getId(key);
char *orfHeader = headerReader.getData(id, thread_idx);
Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader);
unsigned int contigKey = orf.id;
localEliminatedContigs.emplace(contigKey);
}

#pragma omp critical
{
acceptedContigs.insert(localAcceptedContigs.begin(), localAcceptedContigs.end());
eliminatedContigs.insert(localEliminatedContigs.begin(), localEliminatedContigs.end());
}
}

for (auto it = eliminatedContigs.begin(); it != eliminatedContigs.end(); ) {
if (acceptedContigs.find(*it) != acceptedContigs.end()) {
it = eliminatedContigs.erase(it);
} else {
++it;
}
}

std::string resultBuffer;
resultBuffer.reserve(1024 * 1024);
for (auto contigIt = eliminatedContigs.begin(); contigIt != eliminatedContigs.end(); ++contigIt) {
unsigned int contigKey = *contigIt;
std::unordered_map<unsigned int, std::pair<unsigned int, size_t>>::iterator it = contigToLongest.find(contigKey);
if (it != contigToLongest.end()) {
unsigned int orfKey = it->second.first;
resultBuffer.append(SSTR(orfKey));
resultBuffer.append(1, '\n');
writer.writeData(resultBuffer.c_str(), resultBuffer.length(), orfKey, 0, false, false);
resultBuffer.clear();
} else {
Debug(Debug::ERROR) << "Missing contig " << contigKey << "\n";
EXIT(EXIT_FAILURE);
}
}

writer.close(true);
FileUtil::remove(writer.getIndexFileName());
headerReader.close();
resultReader.close();

return EXIT_SUCCESS;
}
1 change: 1 addition & 0 deletions src/workflow/Taxonomy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ int taxonomy(int argc, const char **argv, const Command& command) {
cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
cmd.addVariable("RUNNER", par.runner.c_str());
cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str());
cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str());
cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());

if (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED && !(searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) {
Expand Down
Loading