diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h index de280f046..c8c479c6c 100644 --- a/src/CommandDeclarations.h +++ b/src/CommandDeclarations.h @@ -86,6 +86,7 @@ extern int prefilter(int argc, const char **argv, const Command& command); extern int prefixid(int argc, const char **argv, const Command& command); extern int profile2cs(int argc, const char **argv, const Command& command); extern int profile2pssm(int argc, const char **argv, const Command& command); +extern int profile2neff(int argc, const char **argv, const Command& command); extern int profile2consensus(int argc, const char **argv, const Command& command); extern int profile2repseq(int argc, const char **argv, const Command& command); extern int proteinaln2nucl(int argc, const char **argv, const Command& command); diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp index cef90dd8a..ef427dfe9 100644 --- a/src/MMseqsBase.cpp +++ b/src/MMseqsBase.cpp @@ -1087,6 +1087,13 @@ std::vector baseCommands = { " ", CITATION_MMSEQS2, {{"profileDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb }, {"pssmFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}}, + {"profile2neff", profile2neff, &par.profile2neff, COMMAND_PROFILE, + "Convert a profile DB to a tab-separated list of Neff scores", + NULL, + "Johannes Spies ", + " ", + CITATION_MMSEQS2, {{"profileDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::profileDb }, + {"neffFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::genericDb }}}, {"profile2consensus", profile2consensus, &par.profile2seq, COMMAND_PROFILE, "Extract consensus sequence DB from a profile DB", NULL, diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 97eaa86aa..0f2792a30 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -1111,6 +1111,7 @@ class Parameters { std::vector renamedbkeys; std::vector createtaxdb; std::vector profile2pssm; + std::vector profile2neff; std::vector profile2seq; std::vector besthitbyset; std::vector combinepvalbyset; diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 94c9c5c75..c323980c6 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -42,6 +42,7 @@ set(util_source_files util/pairaln.cpp util/prefixid.cpp util/profile2pssm.cpp + util/profile2neff.cpp util/profile2seq.cpp util/result2dnamsa.cpp util/result2flat.cpp diff --git a/src/util/profile2neff.cpp b/src/util/profile2neff.cpp new file mode 100644 index 000000000..1955ed8df --- /dev/null +++ b/src/util/profile2neff.cpp @@ -0,0 +1,79 @@ +#include "Parameters.h" +#include "DBReader.h" +#include "DBWriter.h" +#include "Util.h" +#include "Debug.h" +#include "Sequence.h" +#include "SubstitutionMatrix.h" +#include "itoa.h" + +#ifdef OPENMP +#include +#endif + +int profile2neff(int argc, const char **argv, const Command &command) { + Parameters &par = Parameters::getInstance(); + par.parseParameters(argc, argv, command, true, 0, MMseqsParameter::COMMAND_PROFILE); + + DBReader reader(par.db1.c_str(), par.db1Index.c_str(), par.threads, DBReader::USE_INDEX | DBReader::USE_DATA); + reader.open(DBReader::LINEAR_ACCCESS); + + const bool isDbOutput = par.dbOut; + const bool shouldCompress = isDbOutput == true && par.compressed == true; + const int dbType = isDbOutput == true ? Parameters::DBTYPE_GENERIC_DB : Parameters::DBTYPE_OMIT_FILE; + DBWriter writer(par.db2.c_str(), par.db2Index.c_str(), par.threads, shouldCompress, dbType); + writer.open(); + + SubstitutionMatrix subMat(par.scoringMatrixFile.values.aminoacid().c_str(), 2.0f, 0.0); + + size_t entries = reader.getSize(); + Debug::Progress progress(entries); +#pragma omp parallel + { + unsigned int thread_idx = 0; +#ifdef OPENMP + thread_idx = static_cast(omp_get_thread_num()); +#endif + + Sequence seq(par.maxSeqLen, Parameters::DBTYPE_HMM_PROFILE, &subMat, 0, false, par.compBiasCorrection, false); + + char buffer[256]; + std::string result; + result.reserve(10 * 1024 * 1024); + +#pragma omp for schedule(dynamic, 1000) + for (size_t i = 0; i < entries; ++i) { + progress.updateProgress(); + + unsigned int key = reader.getDbKey(i); + seq.mapSequence(i, key, reader.getData(i, thread_idx), reader.getSeqLen(i)); + + if (isDbOutput == false) { + result.append("Neff_Ms of sequence "); + Itoa::u32toa_sse2(key, buffer); + result.append(buffer); + result.push_back('\n'); + } + + // add a line containing Neff_M scores of `seq` + for(int j = 0; j < seq.L; ++j) { + // TODO(johannes): Here, the precision for the floating point conversion is not specified and set to 2 + // digits. Consider setting the precision using a commandline parameter. + snprintf(buffer, sizeof(buffer), "%0.4f", seq.neffM[j]); + result.append(buffer); + if(j < (seq.L - 1)) result.push_back('\t'); + } + result.push_back('\n'); + + writer.writeData(result.c_str(), result.length(), key, thread_idx, isDbOutput); + result.clear(); + } + } + writer.close(isDbOutput == false); + if (isDbOutput == false) { + remove(par.db2Index.c_str()); + } + reader.close(); + + return EXIT_SUCCESS; +}