ENH: add action to build custom Diamond database (#103)

* ENH: Build Diamond DB from fasta action * Added parameter compatibility * Linting corrections * Added parameters and optional input to build_diamond_db * Added example and citations for build_diamond_db * Added compatibility with optional input * Taxonomy typed run enabled * Add test for run with taxon information * Update action name to build_custom_diamond_db * Refactor TaxonomyNCBI to NCBITaxonomy * remove file entry from citations * remove pdb from eggnog._methods * eggnog._method.py: moving relative imports to end of section * eggnog.tests.test_method.py: reorganize imports * refactor sequences to seqs * Refactoring seqs and method name. further updates * Refactor taxonomy_data to taxonomy * plugin_setup.py: Update `seqs` input description. * plugin_setup.py: update to `taxonomy` input_description * Another update to description in the function registration * Set default threads to 1 * Another update to parameter description in the function registration * extend command inside * write in/out paths directly in command * Move the db related code to its own files. New files for dbs New files for dbs in eggnog. Further adjustments. * Adjust paths from imports since change of db related code * set --log always to true * Update q2_moshpit/plugin_setup.py Co-authored-by: Michal Ziemski <mziemski@ethz.ch> --------- Co-authored-by: Michal Ziemski <mziemski@ethz.ch>
bokulich-lab · Jan 15, 2024 · 492dc37 · 492dc37
1 parent f31d81a
commit 492dc37
Show file tree

Hide file tree

Showing 9 changed files with 247 additions and 15 deletions.
diff --git a/q2_moshpit/_examples.py b/q2_moshpit/_examples.py
@@ -0,0 +1,25 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+# URL of the protein FASTA file downloaded by the usage example below.
+# The two literals are joined by implicit concatenation inside the
+# parentheses; a backslash continuation would only cover the first literal
+# and silently drop the file-name suffix.
+url = (
+    'https://scop.berkeley.edu/downloads/scopeseq-2.07/astral-scopedom-seqres'
+    '-gd-sel-gs-bib-40-2.07.fa'
+)
+
+
+def diamond_makedb(use):
+    """Usage example for the `build_custom_diamond_db` action.
+
+    Initialises a 'sequences' artifact from the module-level ``url`` and
+    passes it as the ``seqs`` input of the action; the single output is
+    named 'diamond_db'.
+
+    Args:
+        use: Usage driver object supplying the factory helpers called below.
+    """
+    protein_seqs = use.init_artifact_from_url('sequences', url)
+
+    action = use.UsageAction('moshpit', 'build_custom_diamond_db')
+    inputs = use.UsageInputs(seqs=protein_seqs)
+    output_names = use.UsageOutputNames(diamond_db='diamond_db')
+
+    # The returned usage variables are not needed by this example.
+    use.action(action, inputs, output_names)
diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py
@@ -30,7 +30,7 @@
 }
 
 
-def run_command(cmd, env=None, verbose=True, pipe=False):
+def run_command(cmd, env=None, verbose=True, pipe=False, **kwargs):
     if verbose:
         print("Running external command line application(s). This may print "
               "messages to stdout and/or stderr.")
@@ -47,9 +47,9 @@ def run_command(cmd, env=None, verbose=True, pipe=False):
         return result
 
     if env:
-        subprocess.run(cmd, env=env, check=True)
+        subprocess.run(cmd, env=env, check=True, **kwargs)
     else:
-        subprocess.run(cmd, check=True)
+        subprocess.run(cmd, check=True, **kwargs)
 
 
 def _construct_param(arg_name):

diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib
@@ -103,3 +103,23 @@ @article{hyatt_prodigal_2010
 	pages = {119},
 }
 
+@article{buchfink_sensitive_2021,
+	title = {Sensitive protein alignments at tree-of-life scale using {DIAMOND}},
+	volume = {18},
+	copyright = {2021 The Author(s)},
+	issn = {1548-7105},
+	url = {https://www.nature.com/articles/s41592-021-01101-x},
+	doi = {10.1038/s41592-021-01101-x},
+	abstract = {We are at the beginning of a genomic revolution in which all known species are planned to be sequenced. Accessing such data for comparative analyses is crucial in this new age of data-driven biology. Here, we introduce an improved version of DIAMOND that greatly exceeds previous search performances and harnesses supercomputing to perform tree-of-life scale protein alignments in hours, while matching the sensitivity of the gold standard BLASTP.},
+	language = {en},
+	number = {4},
+	urldate = {2023-12-01},
+	journal = {Nature Methods},
+	author = {Buchfink, Benjamin and Reuter, Klaus and Drost, Hajk-Georg},
+	month = apr,
+	year = {2021},
+	note = {Number: 4},
+	publisher = {{Nature Publishing Group}},
+	keywords = {Computational biology and bioinformatics, Genome informatics, Genomic analysis, Sequencing, Software},
+	pages = {366--368},
+}
diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py
@@ -5,12 +5,11 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
-
-
 from ._method import eggnog_diamond_search, eggnog_annotate
-from ._dbs import fetch_eggnog_db, fetch_diamond_db
+from ._dbs import fetch_eggnog_db, build_custom_diamond_db, fetch_diamond_db
+
 
 __all__ = [
     'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
-    'fetch_diamond_db'
+    'fetch_diamond_db', 'build_custom_diamond_db'
 ]
diff --git a/q2_moshpit/eggnog/_dbs.py b/q2_moshpit/eggnog/_dbs.py
@@ -6,10 +6,12 @@
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
 import os
-from .._utils import run_command, colorify
+from q2_types.feature_data import ProteinSequencesDirectoryFormat
 from q2_types_genomics.reference_db import (
-    EggnogRefDirFmt, DiamondDatabaseDirFmt
+    EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt
 )
+from .._utils import run_command, _process_common_input_params, colorify
+from ._utils import _parse_build_diamond_db_params
 
 
 def fetch_eggnog_db() -> EggnogRefDirFmt:
@@ -37,6 +39,54 @@ def fetch_eggnog_db() -> EggnogRefDirFmt:
     return eggnog_db
 
 
+def build_custom_diamond_db(
+        seqs: ProteinSequencesDirectoryFormat,
+        taxonomy: NCBITaxonomyDirFmt = None,
+        threads: int = 1,
+        file_buffer_size: int = 67108864,
+        ignore_warnings: bool = False,
+        no_parse_seqids: bool = False
+        ) -> DiamondDatabaseDirFmt:
+    """Build a DIAMOND database from a protein reference FASTA file.
+
+    Runs ``diamond makedb`` on ``protein-sequences.fasta`` inside ``seqs``
+    and writes the result to ``ref_db.dmnd`` inside a new
+    ``DiamondDatabaseDirFmt``.
+
+    Args:
+        seqs: Directory format holding ``protein-sequences.fasta``.
+        taxonomy: Optional directory format holding
+            ``prot.accession2taxid.gz``, ``nodes.dmp`` and ``names.dmp``.
+            When given, their paths are passed as ``--taxonmap``,
+            ``--taxonnodes`` and ``--taxonnames``.
+        threads: Value passed as ``--threads``.
+        file_buffer_size: Value passed as ``--file-buffer-size``.
+        ignore_warnings: Passed as ``--ignore-warnings`` when true.
+        no_parse_seqids: Passed as ``--no-parse-seqids`` when true.
+
+    Returns:
+        The directory format that ``diamond makedb`` was told to write to.
+    """
+    # List the tunable options explicitly rather than introspecting
+    # locals(), so that a local variable added later can never leak onto
+    # the command line. Insertion order determines the order of the flags.
+    kwargs = {
+        "threads": threads,
+        "file_buffer_size": file_buffer_size,
+        "ignore_warnings": ignore_warnings,
+        "no_parse_seqids": no_parse_seqids,
+    }
+
+    # Add paths to taxonomy data if provided
+    if taxonomy is not None:
+        taxonomy_dir = str(taxonomy)
+        kwargs["taxonmap"] = os.path.join(
+            taxonomy_dir, "prot.accession2taxid.gz"
+        )
+        kwargs["taxonnodes"] = os.path.join(taxonomy_dir, "nodes.dmp")
+        kwargs["taxonnames"] = os.path.join(taxonomy_dir, "names.dmp")
+
+    # Filter out all kwargs that are falsy (except 0 and 0.0) and convert
+    # the remaining ones into command line arguments
+    parsed_args = _process_common_input_params(
+        processing_func=_parse_build_diamond_db_params, params=kwargs
+    )
+
+    # Instantiate output object
+    diamond_db = DiamondDatabaseDirFmt()
+
+    # Run diamond makedb
+    cmd = [
+        "diamond", "makedb",
+        "--verbose", "--log",
+        "--in", os.path.join(str(seqs), "protein-sequences.fasta"),
+        "--db", os.path.join(str(diamond_db), "ref_db.dmnd"),
+        *parsed_args
+    ]
+    run_command(cmd)
+
+    # Return output artifact
+    return diamond_db
+
+
 def fetch_diamond_db() -> DiamondDatabaseDirFmt:
     """
     Downloads diamond reference database using the

diff --git a/q2_moshpit/eggnog/_utils.py b/q2_moshpit/eggnog/_utils.py
@@ -0,0 +1,28 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+from typing import List
+
+
+def _parse_build_diamond_db_params(arg_key, arg_val) -> List[str]:
+    """Creates a list with argument and its value to be consumed by
+    the `diamond makedb` command.
+
+    Args:
+        arg_key (str): Argument name; underscores are converted to dashes.
+        arg_val: Argument value.
+    Returns:
+        List of command line tokens:
+            - ``["--arg-key"]`` for a boolean that is True,
+            - ``[]`` for a boolean that is False (emitting the flag would
+              switch the option on),
+            - ``["--arg-key", str(arg_val)]`` for any other value.
+    """
+    # Change "_" in arg_key for "-"
+    flag = f"--{arg_key.replace('_', '-')}"
+
+    if isinstance(arg_val, bool):
+        # Boolean options are bare switches: present when True, absent
+        # when False, even if the caller did not pre-filter falsy values.
+        return [flag] if arg_val else []
+
+    return [flag, str(arg_val)]
diff --git a/q2_moshpit/eggnog/tests/test_dbs.py b/q2_moshpit/eggnog/tests/test_dbs.py
@@ -8,7 +8,9 @@
 import os
 from unittest.mock import patch, call
 from qiime2.plugin.testing import TestPluginBase
-from .._dbs import fetch_eggnog_db, fetch_diamond_db
+from q2_types.feature_data import ProteinSequencesDirectoryFormat
+from q2_types_genomics.reference_db import NCBITaxonomyDirFmt
+from .._dbs import fetch_eggnog_db, build_custom_diamond_db, fetch_diamond_db
 
 
 class TestFetchDB(TestPluginBase):
@@ -27,6 +29,71 @@ def test_fetch_eggnog_db(self, subp_run):
         ]
         subp_run.assert_called_once_with(cmd, check=True)
 
+
+class TestBuildDiamondDB(TestPluginBase):
+    package = 'q2_moshpit.eggnog.tests'
+
+    @patch("subprocess.run")
+    def test_build_custom_diamond_db_simple(self, subp_run):
+        """Without taxonomy, only the default options are passed on."""
+        # subprocess.run is patched, so no external command is executed.
+        seqs = ProteinSequencesDirectoryFormat()
+        db = build_custom_diamond_db(seqs)
+
+        # Input and output paths the command is expected to reference
+        fasta_fp = os.path.join(str(seqs), "protein-sequences.fasta")
+        db_fp = os.path.join(str(db), "ref_db.dmnd")
+
+        expected_cmd = [
+            "diamond", "makedb",
+            "--verbose", "--log",
+            "--in", fasta_fp,
+            "--db", db_fp,
+            "--threads", "1",
+            "--file-buffer-size", "67108864",
+        ]
+
+        # The command must have been run exactly once, as expected
+        subp_run.assert_called_once_with(expected_cmd, check=True)
+
+    @patch("subprocess.run")
+    def test_build_custom_diamond_db_with_taxonomy(self, subp_run):
+        """With taxonomy, the three taxonomy file paths are appended."""
+        # subprocess.run is patched, so no external command is executed.
+        seqs = ProteinSequencesDirectoryFormat()
+        taxonomy = NCBITaxonomyDirFmt()
+        db = build_custom_diamond_db(seqs, taxonomy)
+
+        # Input and output paths the command is expected to reference
+        taxonomy_dir = str(taxonomy)
+        fasta_fp = os.path.join(str(seqs), "protein-sequences.fasta")
+        tax_map_fp = os.path.join(taxonomy_dir, "prot.accession2taxid.gz")
+        tax_nodes_fp = os.path.join(taxonomy_dir, "nodes.dmp")
+        tax_names_fp = os.path.join(taxonomy_dir, "names.dmp")
+        db_fp = os.path.join(str(db), "ref_db.dmnd")
+
+        expected_cmd = [
+            "diamond", "makedb",
+            "--verbose", "--log",
+            "--in", fasta_fp,
+            "--db", db_fp,
+            "--threads", "1",
+            "--file-buffer-size", "67108864",
+            "--taxonmap", tax_map_fp,
+            "--taxonnodes", tax_nodes_fp,
+            "--taxonnames", tax_names_fp,
+        ]
+
+        # The command must have been run exactly once, as expected
+        subp_run.assert_called_once_with(expected_cmd, check=True)
+
     @patch("subprocess.run")
     def test_fetch_diamond_db(self, subp_run):
         # Call function. Patching will make sure nothing is

diff --git a/q2_moshpit/eggnog/tests/test_method.py b/q2_moshpit/eggnog/tests/test_method.py
@@ -5,14 +5,15 @@
 #
 # The full license is in the file LICENSE, distributed with this software.
 # ----------------------------------------------------------------------------
+import qiime2
 import pandas as pd
 import pandas.testing as pdt
-import qiime2
 from qiime2.plugin.testing import TestPluginBase
 from q2_types_genomics.feature_data import MAGSequencesDirFmt
 from .._method import eggnog_diamond_search, eggnog_annotate
 from q2_types_genomics.reference_db import (
-    DiamondDatabaseDirFmt, EggnogRefDirFmt)
+    DiamondDatabaseDirFmt, EggnogRefDirFmt
+)
 from q2_types_genomics.per_sample_data import ContigSequencesDirFmt
 from q2_types_genomics.genome_data import SeedOrthologDirFmt, OrthologFileFmt
 

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
@@ -8,7 +8,9 @@
 import importlib
 
 from q2_types.distance_matrix import DistanceMatrix
-from q2_types.feature_data import FeatureData, Sequence, Taxonomy
+from q2_types.feature_data import (
+    FeatureData, Sequence, Taxonomy, ProteinSequence
+)
 from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence
 from q2_types.per_sample_sequences import (
     SequencesWithQuality, PairedEndSequencesWithQuality
@@ -18,7 +20,7 @@
 from qiime2.core.type import Bool, Range, Int, Str, Float, List, Choices
 from qiime2.core.type import (Properties, TypeMap)
 from qiime2.plugin import (Plugin, Citations)
-
+import q2_moshpit._examples as ex
 import q2_moshpit
 from q2_types_genomics.feature_data import NOG, MAG
 from q2_types_genomics.genome_data import (
@@ -31,7 +33,9 @@
 from q2_types_genomics.kraken2._type import BrackenDB
 from q2_types_genomics.per_sample_data import MAGs, Contigs
 from q2_types_genomics.per_sample_data._type import AlignmentMap
-from q2_types_genomics.reference_db import ReferenceDB, Diamond, Eggnog
+from q2_types_genomics.reference_db import (
+    ReferenceDB, Diamond, Eggnog, NCBITaxonomy
+)
 
 citations = Citations.load('citations.bib', package='q2_moshpit')
 
@@ -485,6 +489,44 @@
                 'analyses.'
 )
 
+# Register `build_custom_diamond_db` as a plugin method. It takes protein
+# sequences and a `taxonomy` reference as inputs and produces a single
+# DIAMOND reference database output. The parameter names listed here must
+# match the keyword arguments of the registered function.
+plugin.methods.register_function(
+    function=q2_moshpit.eggnog.build_custom_diamond_db,
+    inputs={
+        'seqs': FeatureData[ProteinSequence],
+        'taxonomy': ReferenceDB[NCBITaxonomy],
+    },
+    input_descriptions={
+        'seqs': "Protein reference database.",
+        'taxonomy': "Reference taxonomy, "
+                    "needed to provide taxonomy features."
+    },
+    outputs=[('diamond_db', ReferenceDB[Diamond])],
+    output_descriptions={
+        'diamond_db': "DIAMOND database."
+    },
+    parameters={
+        "threads": Int % Range(1, None),
+        "file_buffer_size": Int % Range(1, None),
+        "ignore_warnings": Bool,
+        "no_parse_seqids": Bool
+    },
+    parameter_descriptions={
+        "threads": "Number of CPU threads.",
+        "file_buffer_size": "File buffer size in bytes.",
+        "ignore_warnings": "Ignore warnings.",
+        "no_parse_seqids": "Print raw seqids without parsing."
+    },
+    name="Create a DIAMOND formatted reference database from a FASTA input "
+         "file.",
+    description="Creates an artifact containing a binary DIAMOND database "
+                "file (ref_db.dmnd) from a protein reference database "
+                "file in FASTA format.",
+    citations=[citations["buchfink_sensitive_2021"]],
+    # Usage example defined in q2_moshpit._examples (imported as `ex`)
+    examples={
+        "Minimum working example": ex.diamond_makedb
+    }
+)
+
 plugin.methods.register_function(
     function=q2_moshpit.eggnog.fetch_eggnog_db,
     inputs={},