Skip to content

Commit

Permalink
Merge commit '97d4c6cfb57bb7f0994015580579f31a18aaf9c5'
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed Aug 2, 2024
2 parents bc212bc + 97d4c6c commit 04876ca
Show file tree
Hide file tree
Showing 20 changed files with 293 additions and 54 deletions.
1 change: 1 addition & 0 deletions lib/mmseqs/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ if (NATIVE_ARCH AND (MMSEQS_ARCH STREQUAL ""))
set(MMSEQS_ARCH "-march=native")
endif ()
endif ()
# Store the architecture flags selected above in the CMake cache.
# An INTERNAL cache entry implies FORCE, so the cached value is refreshed on
# every configure run. The expansion is quoted so the value is always passed
# as exactly one argument, even when it is empty.
set(MMSEQS_ARCH "${MMSEQS_ARCH}" CACHE INTERNAL "")

if (NOT (MMSEQS_ARCH STREQUAL ""))
set(MMSEQS_CXX_FLAGS "${MMSEQS_CXX_FLAGS} ${MMSEQS_ARCH}")
Expand Down
2 changes: 1 addition & 1 deletion lib/mmseqs/azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:
- job: build_macos
displayName: macOS
pool:
vmImage: 'macos-11'
vmImage: 'macos-12'
steps:
- checkout: self
submodules: true
Expand Down
17 changes: 13 additions & 4 deletions lib/mmseqs/data/workflow/createtaxdb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,22 @@ downloadFile() {
ARIA)
FILENAME=$(basename "${OUTPUT}")
DIR=$(dirname "${OUTPUT}")
aria2c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "$FILENAME" -d "$DIR" "$URL" && return 0
if aria2c -c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "${FILENAME}.aria2" -d "$DIR" "$URL"; then
mv -f -- "${OUTPUT}.aria2" "${OUTPUT}"
return 0
fi
;;
CURL)
curl -o "$OUTPUT" "$URL" && return 0
if curl -L -C - -o "${OUTPUT}.curl" "$URL"; then
mv -f -- "${OUTPUT}.curl" "${OUTPUT}"
return 0
fi
;;
WGET)
wget -O "$OUTPUT" "$URL" && return 0
if wget -O "${OUTPUT}.wget" -c "$URL"; then
mv -f -- "${OUTPUT}.wget" "${OUTPUT}"
return 0
fi
;;
esac
done
Expand All @@ -59,7 +68,7 @@ if { [ "${DBMODE}" = "1" ] && notExists "${TAXDBNAME}_taxonomy"; } || { [ "${DBM
# Download NCBI taxon information
if notExists "${TMP_PATH}/ncbi_download.complete"; then
echo "Download taxdump.tar.gz"
downloadFile "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" "${TMP_PATH}/taxdump.tar.gz"
downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz" "${TMP_PATH}/taxdump.tar.gz"
tar -C "${TMP_PATH}" -xzf "${TMP_PATH}/taxdump.tar.gz" names.dmp nodes.dmp merged.dmp delnodes.dmp
touch "${TMP_PATH}/ncbi_download.complete"
rm -f "${TMP_PATH}/taxdump.tar.gz"
Expand Down
23 changes: 16 additions & 7 deletions lib/mmseqs/data/workflow/databases.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,22 @@ downloadFile() {
ARIA)
FILENAME=$(basename "${OUTPUT}")
DIR=$(dirname "${OUTPUT}")
aria2c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "$FILENAME" -d "$DIR" "$URL" && return 0
if aria2c -c --max-connection-per-server="$ARIA_NUM_CONN" --allow-overwrite=true -o "${FILENAME}.aria2" -d "$DIR" "$URL"; then
mv -f -- "${OUTPUT}.aria2" "${OUTPUT}"
return 0
fi
;;
CURL)
curl -L -o "$OUTPUT" "$URL" && return 0
if curl -L -C - -o "${OUTPUT}.curl" "$URL"; then
mv -f -- "${OUTPUT}.curl" "${OUTPUT}"
return 0
fi
;;
WGET)
wget -O "$OUTPUT" "$URL" && return 0
if wget -O "${OUTPUT}.wget" -c "$URL"; then
mv -f -- "${OUTPUT}.wget" "${OUTPUT}"
return 0
fi
;;
esac
done
Expand Down Expand Up @@ -118,9 +127,9 @@ case "${SELECTION}" in
if notExists "${TMP_PATH}/nr.gz"; then
date "+%s" > "${TMP_PATH}/version"
downloadFile "https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz" "${TMP_PATH}/nr.gz"
downloadFile "https://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz" "${TMP_PATH}/prot.accession2taxid.gz"
downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz" "${TMP_PATH}/prot.accession2taxid.gz"
gunzip "${TMP_PATH}/prot.accession2taxid.gz"
downloadFile "https://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz" "${TMP_PATH}/pdb.accession2taxid.gz"
downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/pdb.accession2taxid.gz" "${TMP_PATH}/pdb.accession2taxid.gz"
gunzip "${TMP_PATH}/pdb.accession2taxid.gz"
fi
push_back "${TMP_PATH}/nr.gz"
Expand Down Expand Up @@ -212,8 +221,8 @@ case "${SELECTION}" in
;;
"CDD")
if notExists "${TMP_PATH}/msa.msa.gz"; then
downloadFile "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cdd.info" "${TMP_PATH}/version"
downloadFile "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/fasta.tar.gz" "${TMP_PATH}/msa.tar.gz"
downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/cdd.info" "${TMP_PATH}/version"
downloadFile "https://ftp.ncbi.nlm.nih.gov/pub/mmdb/cdd/fasta.tar.gz" "${TMP_PATH}/msa.tar.gz"
fi
INPUT_TYPE="FASTA_MSA"
SED_FIX_LOOKUP='s|\.FASTA||g'
Expand Down
6 changes: 5 additions & 1 deletion lib/mmseqs/data/workflow/taxpercontig.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,11 @@ if [ -n "${ORF_FILTER}" ]; then
fi

if notExists "${TMP_PATH}/orfs_aln.list"; then
awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln.list"
# shellcheck disable=SC2086
"$MMSEQS" recoverlongestorf "${ORFS_DB}" "${TMP_PATH}/orfs_aln" "${TMP_PATH}/orfs_aln_recovered.list" ${THREADS_PAR} \
|| fail "recoverlongestorf died"
awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln_remain.list"
cat "${TMP_PATH}/orfs_aln_recovered.list" "${TMP_PATH}/orfs_aln_remain.list" > "${TMP_PATH}/orfs_aln.list"
fi

if notExists "${TMP_PATH}/orfs_filter.dbtype"; then
Expand Down
32 changes: 22 additions & 10 deletions lib/mmseqs/data/workflow/tsv2exprofiledb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,37 @@ OUT="$2"
[ -d "${OUT}.tsv" ] && echo "${OUT} is a directory!" && exit 1;

if notExists "${OUT}_h.dbtype"; then
"$MMSEQS" tsv2db "${IN}_h.tsv" "${OUT}_h" --output-dbtype 12 ${VERBOSITY}
MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}_h.tsv" "${OUT}_h" --output-dbtype 12 ${VERBOSITY}
fi

if notExists "${OUT}.dbtype"; then
"$MMSEQS" tsv2db "${IN}.tsv" "${OUT}_tmp" --output-dbtype 0 ${VERBOSITY}
MMSEQS_FOCE_MERGE=1 "$MMSEQS" compress "${OUT}_tmp" "${OUT}" ${VERBOSITY}
"$MMSEQS" rmdb "${OUT}_tmp" ${VERBOSITY}
if [ -n "${COMPRESSED}" ]; then
"$MMSEQS" tsv2db "${IN}.tsv" "${OUT}_tmp" --output-dbtype 0 ${VERBOSITY}
MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_tmp" "${OUT}" ${VERBOSITY}
"$MMSEQS" rmdb "${OUT}_tmp" ${VERBOSITY}
else
MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}.tsv" "${OUT}" --output-dbtype 0 ${VERBOSITY}
fi
fi

if notExists "${OUT}_seq.dbtype"; then
"$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq_tmp" --output-dbtype 0 ${VERBOSITY}
MMSEQS_FOCE_MERGE=1 "$MMSEQS" compress "${OUT}_seq_tmp" "${OUT}_seq" ${VERBOSITY}
"$MMSEQS" rmdb "${OUT}_seq_tmp" ${VERBOSITY}
if [ -n "${COMPRESSED}" ]; then
"$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq_tmp" --output-dbtype 0 ${VERBOSITY}
MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_seq_tmp" "${OUT}_seq" ${VERBOSITY}
"$MMSEQS" rmdb "${OUT}_seq_tmp" ${VERBOSITY}
else
# Uncompressed path: write the sequence database directly to its final name.
# MMSEQS_FORCE_MERGE=1 is set to match the other direct tsv2db calls in this script.
MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}_seq.tsv" "${OUT}_seq" --output-dbtype 0 ${VERBOSITY}
fi
fi

if notExists "${OUT}_aln.dbtype"; then
"$MMSEQS" tsv2db "${IN}_aln.tsv" "${OUT}_aln_tmp" --output-dbtype 5 ${VERBOSITY}
MMSEQS_FOCE_MERGE=1 "$MMSEQS" compress "${OUT}_aln_tmp" "${OUT}_aln" ${VERBOSITY}
"$MMSEQS" rmdb "${OUT}_aln_tmp" ${VERBOSITY}
if [ -n "${COMPRESSED}" ]; then
"$MMSEQS" tsv2db "${IN}_aln.tsv" "${OUT}_aln_tmp" --output-dbtype 5 ${VERBOSITY}
MMSEQS_FORCE_MERGE=1 "$MMSEQS" compress "${OUT}_aln_tmp" "${OUT}_aln" ${VERBOSITY}
"$MMSEQS" rmdb "${OUT}_aln_tmp" ${VERBOSITY}
else
MMSEQS_FORCE_MERGE=1 "$MMSEQS" tsv2db "${IN}_aln.tsv" "${OUT}_aln" --output-dbtype 5 ${VERBOSITY}
fi
fi

if notExists "${OUT}_seq_h.dbtype"; then
Expand Down
7 changes: 4 additions & 3 deletions lib/mmseqs/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,9 @@ else ()
endif ()

target_link_libraries(mmseqs-framework tinyexpr ${ZSTD_LIBRARIES} microtar)
if (CYGWIN)
target_link_libraries(mmseqs-framework nedmalloc)
endif ()
# if (CYGWIN)
# target_link_libraries(mmseqs-framework nedmalloc)
# endif ()

if (EMSCRIPTEN)
target_compile_definitions(mmseqs-framework PUBLIC -DHAVE_ZLIB=1 -DHAVE_BZLIB=1)
Expand Down Expand Up @@ -222,6 +222,7 @@ if (OPENMP_FOUND)
if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
target_link_libraries(mmseqs-framework ${OpenMP_CXX_LIBRARIES})
endif()
target_include_directories(mmseqs-framework PUBLIC ${OpenMP_CXX_INCLUDE_DIRS})
append_target_property(mmseqs-framework COMPILE_FLAGS ${OpenMP_CXX_FLAGS})
append_target_property(mmseqs-framework LINK_FLAGS ${OpenMP_CXX_FLAGS})
elseif (REQUIRE_OPENMP)
Expand Down
1 change: 1 addition & 0 deletions lib/mmseqs/src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ extern int ungappedprefilter(int argc, const char **argv, const Command& command
extern int gappedprefilter(int argc, const char **argv, const Command& command);
extern int unpackdb(int argc, const char **argv, const Command& command);
extern int rbh(int argc, const char **argv, const Command& command);
extern int recoverlongestorf(int argc, const char **argv, const Command& command);
extern int result2flat(int argc, const char **argv, const Command& command);
extern int result2msa(int argc, const char **argv, const Command& command);
extern int result2dnamsa(int argc, const char **argv, const Command& command);
Expand Down
10 changes: 9 additions & 1 deletion lib/mmseqs/src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,14 @@ std::vector<Command> baseCommands = {
"Eli Levy Karin <eli.levy.karin@gmail.com> ",
"<i:contigsSequenceDB> <i:extractedOrfsHeadersDB> <o:orfsAlignedToContigDB>",
CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
// Command table entry for "recoverlongestorf": dispatches to recoverlongestorf(),
// uses the par.onlythreads parameter set and is listed under COMMAND_EXPERT.
// It declares three positional databases: an input sequence DB (orfDB), an input
// result DB (resultDB) and an output flat file (tsvFile).
{"recoverlongestorf", recoverlongestorf, &par.onlythreads, COMMAND_EXPERT,
"Recover longest ORF for taxonomy annotation after elimination",
NULL,
"Sung-eun Jang",
"<i:orfDB> <i:resultDB> <o:tsvFile>",
CITATION_MMSEQS2, {{"orfDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb},
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb},
{"tsvFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
{"reverseseq", reverseseq, &par.reverseseq, COMMAND_SEQUENCE,
"Reverse (without complement) sequences",
NULL,
Expand Down Expand Up @@ -1142,7 +1150,7 @@ std::vector<Command> baseCommands = {
"<i:hhsuiteHHMDB> <o:profileDB>",
CITATION_MMSEQS2,{{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},

{"tsv2exprofiledb", tsv2exprofiledb, &par.onlyverbosity, COMMAND_PROFILE_PROFILE,
{"tsv2exprofiledb", tsv2exprofiledb, &par.verbandcompression, COMMAND_PROFILE_PROFILE,
"Create a expandable profile db from TSV files",
NULL,
"Milot Mirdita <milot@mirdita.de>",
Expand Down
2 changes: 1 addition & 1 deletion lib/mmseqs/src/alignment/Alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ void Alignment::run(const std::string &outDB, const std::string &outDBIndex, con

// recompute alignment boundaries (without changing evalue)
const bool isIdentity = (queryDbKey == swResults[result].dbKey && (includeIdentity || sameQTDB)) ? true : false;
Matcher::result_t res = realigner->getSWResult(&dbSeq, INT_MAX, false, realignCov, covThr, FLT_MAX, realignSwMode, seqIdMode, isIdentity);
Matcher::result_t res = realigner->getSWResult(&dbSeq, INT_MAX, false, covMode, realignCov, FLT_MAX, realignSwMode, seqIdMode, isIdentity);

const bool covOK = Util::hasCoverage(realignCov, covMode, res.qcov, res.dbcov);
if (covOK == true || isIdentity) {
Expand Down
16 changes: 8 additions & 8 deletions lib/mmseqs/src/commons/MultiParam.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,21 @@ class NuclAA {
T first;
T second;

NuclAA<T>(const NuclAA<T> &value) {
NuclAA(const NuclAA<T> &value) {
this->first = value.first;
this->second = value.second;
}

static const T max;

NuclAA<T>() {}
NuclAA() {}

NuclAA<T>(T first) {
NuclAA(T first) {
this->first = first;
this->second = first;
}

NuclAA<T>(T first, T second) {
NuclAA(T first, T second) {
this->first = first;
this->second = second;
}
Expand Down Expand Up @@ -88,21 +88,21 @@ class SeqProf {
T first;
T second;

SeqProf<T>(const SeqProf<T> &value) {
SeqProf(const SeqProf<T> &value) {
this->first = value.first;
this->second = value.second;
}

static const T max;

SeqProf<T>() {}
SeqProf() {}

SeqProf<T>(T first) {
SeqProf(T first) {
this->first = first;
this->second = first;
}

SeqProf<T>(T first, T second) {
SeqProf(T first, T second) {
this->first = first;
this->second = second;
}
Expand Down
4 changes: 2 additions & 2 deletions lib/mmseqs/src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ Parameters::Parameters():
// taxonomyreport
PARAM_REPORT_MODE(PARAM_REPORT_MODE_ID, "--report-mode", "Report mode", "Taxonomy report mode 0: Kraken 1: Krona", typeid(int), (void *) &reportMode, "^[0-1]{1}$"),
// createtaxdb
PARAM_NCBI_TAX_DUMP(PARAM_NCBI_TAX_DUMP_ID, "--ncbi-tax-dump", "NCBI tax dump directory", "NCBI tax dump directory. The tax dump can be downloaded here \"ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz\"", typeid(std::string), (void *) &ncbiTaxDump, ""),
PARAM_NCBI_TAX_DUMP(PARAM_NCBI_TAX_DUMP_ID, "--ncbi-tax-dump", "NCBI tax dump directory", "NCBI tax dump directory. The tax dump can be downloaded here \"ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz\"", typeid(std::string), (void *) &ncbiTaxDump, ""),
PARAM_TAX_MAPPING_FILE(PARAM_TAX_MAPPING_FILE_ID, "--tax-mapping-file", "Taxonomy mapping file", "File to map sequence identifier to taxonomical identifier", typeid(std::string), (void *) &taxMappingFile, ""),
PARAM_TAX_MAPPING_MODE(PARAM_TAX_MAPPING_MODE_ID, "--tax-mapping-mode", "Taxonomy mapping mode", "Map taxonomy based on sequence database 0: .lookup file 1: .source file", typeid(int), (void *) &taxMappingMode, "^[0-1]{1}$"),
PARAM_TAX_DB_MODE(PARAM_TAX_DB_MODE_ID, "--tax-db-mode", "Taxonomy db mode", "Create taxonomy database as: 0: .dmp flat files (human readable) 1: binary dump (faster readin)", typeid(int), (void *) &taxDbMode, "^[0-1]{1}$"),
Expand Down Expand Up @@ -1870,7 +1870,7 @@ void Parameters::parseParameters(int argc, const char *pargv[], const Command &c
EXIT(EXIT_FAILURE);
}
filenames.emplace_back(posix);
delete posix;
delete[] posix;
}
#else
filenames.emplace_back(pargv[argIdx]);
Expand Down
2 changes: 1 addition & 1 deletion lib/mmseqs/src/prefiltering/CacheFriendlyOperations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ CacheFriendlyOperations<BINSIZE>::CacheFriendlyOperations(size_t maxElement, siz
}

template<unsigned int BINSIZE>
CacheFriendlyOperations<BINSIZE>::~CacheFriendlyOperations<BINSIZE>(){
CacheFriendlyOperations<BINSIZE>::~CacheFriendlyOperations(){
delete[] duplicateBitArray;
delete[] binDataFrame;
delete[] tmpElementBuffer;
Expand Down
1 change: 1 addition & 0 deletions lib/mmseqs/src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ set(util_source_files
util/profile2pssm.cpp
util/profile2neff.cpp
util/profile2seq.cpp
util/recoverlongestorf.cpp
util/result2dnamsa.cpp
util/result2flat.cpp
util/result2msa.cpp
Expand Down
Loading

0 comments on commit 04876ca

Please sign in to comment.