From d5717e82128f5604b821133749b919299d15248c Mon Sep 17 00:00:00 2001
From: Milot Mirdita
Date: Wed, 10 Feb 2021 20:20:41 +0100
Subject: [PATCH] Add CDD to databases downloader #410

Register "CDD" in the databases workflow. The shell script downloads
cdd.info (stored as the version file) and fasta.tar.gz (stored as
msa.tar.gz) from ftp.ncbi.nih.gov and routes them through the existing
FASTA_MSA input type.

The FASTA_MSA handler gains two optional knobs:
  * FASTA_MSA_SED: if non-empty, a sed script applied to msa.lookup
    after tar2db. CDD uses it to strip ".FASTA" from the lookup entries.
  * FASTA_MSA_MSA2PROFILE_PAR: extra arguments appended to the
    msa2profile call. CDD passes --skip-query.

The CDD download guard tests msa.tar.gz, i.e. the same path the archive
is downloaded to.
---
NOTE(review): the leading whitespace of the context lines below was
reconstructed assuming 4-space indentation; confirm against the target
files before applying. The From: header carries no e-mail address in this
copy of the patch. The index lines are unchanged from the original patch.

 data/workflow/databases.sh | 19 ++++++++++++++++++-
 src/workflow/Databases.cpp |  7 +++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/data/workflow/databases.sh b/data/workflow/databases.sh
index 4665ec61a..cbe2bb7f2 100644
--- a/data/workflow/databases.sh
+++ b/data/workflow/databases.sh
@@ -193,6 +193,18 @@ case "${SELECTION}" in
         fi
         INPUT_TYPE="eggNOG"
     ;;
+    "CDD")
+        # Guard on the same archive path that the second downloadFile call writes.
+        if notExists "${TMP_PATH}/msa.tar.gz"; then
+            downloadFile "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/cdd.info" "${TMP_PATH}/version"
+            downloadFile "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/fasta.tar.gz" "${TMP_PATH}/msa.tar.gz"
+        fi
+        INPUT_TYPE="FASTA_MSA"
+        # Read by the FASTA_MSA handler: a sed script run over msa.lookup
+        # (removes every ".FASTA") and an extra argument for msa2profile.
+        FASTA_MSA_SED='s|\.FASTA||g'
+        FASTA_MSA_MSA2PROFILE_PAR="--skip-query"
+    ;;
     "Resfinder")
         if notExists "${TMP_PATH}/download.done"; then
             downloadFile "https://api.bitbucket.org/2.0/repositories/genomicepidemiology/resfinder_db/commit/master?fields=hash,date" "${TMP_PATH}/version"
@@ -302,9 +314,14 @@ case "${INPUT_TYPE}" in
         # shellcheck disable=SC2086
         "${MMSEQS}" tar2db "${TMP_PATH}/msa.tar.gz" "${TMP_PATH}/msa" --output-dbtype 11 ${THREADS_PAR} \
             || fail "tar2db died"
+        # Optional rewrite of the lookup file; skipped when FASTA_MSA_SED is empty or unset.
+        if [ -n "${FASTA_MSA_SED}" ]; then
+            sed "${FASTA_MSA_SED}" "${TMP_PATH}/msa.lookup" > "${TMP_PATH}/msa.lookup_tmp"
+            mv -f "${TMP_PATH}/msa.lookup_tmp" "${TMP_PATH}/msa.lookup"
+        fi
         rm -f "${TMP_PATH}/msa.tar.gz"
         # shellcheck disable=SC2086
-        "${MMSEQS}" msa2profile "${TMP_PATH}/msa" "${OUTDB}" --match-mode 1 --match-ratio 0.5 ${THREADS_PAR} \
+        "${MMSEQS}" msa2profile "${TMP_PATH}/msa" "${OUTDB}" --match-mode 1 --match-ratio 0.5 ${FASTA_MSA_MSA2PROFILE_PAR} ${THREADS_PAR} \
             || fail "msa2profile died"
         if [ -n "${REMOVE_TMP}" ]; then
             # shellcheck disable=SC2086
diff --git a/src/workflow/Databases.cpp b/src/workflow/Databases.cpp
index beaeed786..d8fd16e35 100644
--- a/src/workflow/Databases.cpp
+++ b/src/workflow/Databases.cpp
@@ -123,6 +123,13 @@ std::vector downloads = {{
     "https://xfam.wordpress.com/2020/06/30/a-new-pfam-b-is-released",
     false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
     { }
+}, {
+    "CDD",
+    "Conserved Domain Database is a protein annotation resource consisting of well-annotated MSAs for ancient domains and full-length proteins.",
+    "Lu et al: CDD/SPARCLE: the conserved domain database in 2020. Nucleic Acids Res 48(D1), D265–D268 (2020)",
+    "https://www.ncbi.nlm.nih.gov/Structure/cdd/cdd.shtml",
+    false, Parameters::DBTYPE_HMM_PROFILE, databases_sh, databases_sh_len,
+    { }
 }, {
     "eggNOG",
     "eggNOG is a hierarchical, functionally and phylogenetically annotated orthology resource",