bokulich-lab · misialq · Jan 15, 2024 · Nov 27, 2023 · Nov 27, 2023 · Nov 27, 2023
diff --git a/q2_moshpit/_examples.py b/q2_moshpit/_examples.py
@@ -0,0 +1,25 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016-2023, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+url = (
+    'https://scop.berkeley.edu/downloads/scopeseq-2.07/astral-scopedom-seqres'
+    '-gd-sel-gs-bib-40-2.07.fa')
+
+
+def diamond_makedb(use):
+    """Usage example: build a DIAMOND database from a downloaded FASTA."""
+    # Fetch the example protein sequences referenced by the module `url`
+    fasta_input = use.init_artifact_from_url('sequences', url)
+
+    # The action id has to match the name of the registered function,
+    # which is `build_custom_diamond_db`.
+    _ = use.action(
+        use.UsageAction('moshpit', 'build_custom_diamond_db'),
+        use.UsageInputs(sequences=fasta_input),
+        use.UsageOutputNames(diamond_db='diamond_db'),
+    )
diff --git a/q2_moshpit/_utils.py b/q2_moshpit/_utils.py
@@ -9,7 +9,7 @@
 from typing import List
 
 
-def run_command(cmd, env=None, verbose=True, pipe=False):
+def run_command(cmd, env=None, verbose=True, pipe=False, **kwargs):
     if verbose:
         print("Running external command line application(s). This may print "
               "messages to stdout and/or stderr.")
@@ -26,9 +26,9 @@ def run_command(cmd, env=None, verbose=True, pipe=False):
         return result
 
     if env:
-        subprocess.run(cmd, env=env, check=True)
+        subprocess.run(cmd, env=env, check=True, **kwargs)
     else:
-        subprocess.run(cmd, check=True)
+        subprocess.run(cmd, check=True, **kwargs)
 
 
 def _construct_param(arg_name):

diff --git a/q2_moshpit/citations.bib b/q2_moshpit/citations.bib
@@ -103,3 +103,24 @@ @article{hyatt_prodigal_2010
 	pages = {119},
 }
 
+@article{buchfink_sensitive_2021,
+	title = {Sensitive protein alignments at tree-of-life scale using {DIAMOND}},
+	volume = {18},
+	copyright = {2021 The Author(s)},
+	issn = {1548-7105},
+	url = {https://www.nature.com/articles/s41592-021-01101-x},
+	doi = {10.1038/s41592-021-01101-x},
+	abstract = {We are at the beginning of a genomic revolution in which all known species are planned to be sequenced. Accessing such data for comparative analyses is crucial in this new age of data-driven biology. Here, we introduce an improved version of DIAMOND that greatly exceeds previous search performances and harnesses supercomputing to perform tree-of-life scale protein alignments in hours, while matching the sensitivity of the gold standard BLASTP.},
+	language = {en},
+	number = {4},
+	urldate = {2023-12-01},
+	journal = {Nature Methods},
+	author = {Buchfink, Benjamin and Reuter, Klaus and Drost, Hajk-Georg},
+	month = apr,
+	year = {2021},
+	note = {Number: 4},
+	publisher = {{Nature Publishing Group}},
+	keywords = {Computational biology and bioinformatics, Genome informatics, Genomic analysis, Sequencing, Software},
+	pages = {366--368},
+	file = {Full Text PDF:/Users/santiago/Zotero/storage/SHFF6JHD/Buchfink et al. - 2021 - Sensitive protein alignments at tree-of-life scale.pdf:application/pdf},
+}
diff --git a/q2_moshpit/eggnog/__init__.py b/q2_moshpit/eggnog/__init__.py
@@ -7,6 +7,12 @@
 # ----------------------------------------------------------------------------
 
 
-from ._method import (eggnog_diamond_search, eggnog_annotate, fetch_eggnog_db)
+from ._method import (
+  eggnog_diamond_search, eggnog_annotate, fetch_eggnog_db,
+  build_custom_diamond_db
+)
 
-__all__ = ['eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db']
+__all__ = [
+  'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
+  'build_custom_diamond_db'
+]
diff --git a/q2_moshpit/eggnog/_method.py b/q2_moshpit/eggnog/_method.py
@@ -10,14 +10,20 @@
 import os
 import tempfile
 import qiime2.util
+# import pdb here for debugging purposes
 import pandas as pd
 from typing import Union
 from q2_types_genomics.per_sample_data import ContigSequencesDirFmt
 from q2_types_genomics.genome_data import SeedOrthologDirFmt, OrthologFileFmt
+from .._utils import run_command, _process_common_input_params
+from ._utils import _parse_build_diamond_db_params
 from q2_types_genomics.reference_db import EggnogRefDirFmt
-from q2_types.feature_data import DNAFASTAFormat
-from q2_types_genomics.reference_db import DiamondDatabaseDirFmt
-from .._utils import run_command
+from q2_types.feature_data import (
+    DNAFASTAFormat, ProteinSequencesDirectoryFormat
+)
+from q2_types_genomics.reference_db import (
+    DiamondDatabaseDirFmt, NCBITaxonomyDirFmt
+)
 from q2_types_genomics.feature_data import (
     OrthologAnnotationDirFmt, MAGSequencesDirFmt
 )
@@ -134,6 +140,59 @@ def _annotate_seed_orthologs_runner(seed_ortholog, eggnog_db, sample_label,
     subprocess.run(cmds, check=True)
 
 
+def build_custom_diamond_db(
+        sequences: ProteinSequencesDirectoryFormat,
+        taxonomy_data: NCBITaxonomyDirFmt = None,
+        threads: int = None,
+        verbose: bool = False,
+        log: bool = False,
+        file_buffer_size: int = 67108864,
+        ignore_warnings: bool = False,
+        no_parse_seqids: bool = False
+        ) -> DiamondDatabaseDirFmt:
+    """Build a DIAMOND database from a protein FASTA reference file.
+
+    Runs `diamond makedb` on the FASTA file found in `sequences`. When
+    `taxonomy_data` is given, its accession map, nodes and names files are
+    also passed to DIAMOND. Returns the format holding `ref_db.dmnd`.
+    """
+    # List the options explicitly instead of iterating over locals(),
+    # whose contents may change while the loop binds its own variables.
+    kwargs = {
+        "threads": threads, "verbose": verbose, "log": log,
+        "file_buffer_size": file_buffer_size,
+        "ignore_warnings": ignore_warnings,
+        "no_parse_seqids": no_parse_seqids,
+    }
+
+    # Add paths to taxonomy data if provided
+    if taxonomy_data is not None:
+        tax_dir = str(taxonomy_data)
+        kwargs["taxonmap"] = os.path.join(tax_dir, "prot.accession2taxid.gz")
+        kwargs["taxonnodes"] = os.path.join(tax_dir, "nodes.dmp")
+        kwargs["taxonnames"] = os.path.join(tax_dir, "names.dmp")
+
+    # Turn the options into CLI arguments; the shared helper is expected to
+    # skip falsy values other than 0 and 0.0 (it is defined elsewhere).
+    parsed_args = _process_common_input_params(
+        processing_func=_parse_build_diamond_db_params, params=kwargs
+    )
+
+    # Instantiate output object
+    diamond_db = DiamondDatabaseDirFmt()
+
+    # Define path to in/output file
+    path_in = os.path.join(str(sequences), "protein-sequences.fasta")
+    path_out = os.path.join(str(diamond_db), "ref_db.dmnd")
+
+    # Run diamond makedb
+    cmd = ["diamond", "makedb", "--in", path_in, "--db", path_out]
+    cmd.extend(parsed_args)
+    run_command(cmd)
+
+    return diamond_db
+
+
 def fetch_eggnog_db() -> EggnogRefDirFmt:
     """
     Downloads eggnog reference database using the

diff --git a/q2_moshpit/eggnog/_utils.py b/q2_moshpit/eggnog/_utils.py
@@ -0,0 +1,28 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2022, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+from typing import List
+
+
+def _parse_build_diamond_db_params(arg_key, arg_val) -> List[str]:
+    """Convert one option into `diamond makedb` command line tokens.
+
+    Args:
+        arg_key (str): Argument name; "_" is replaced with "-".
+        arg_val: Argument value.
+    Returns:
+        List[str]: `[--flag]` for a true boolean, an empty list for a
+            false boolean, and `[--flag, str(value)]` otherwise.
+    """
+    flag = "--" + arg_key.replace("_", "-")
+
+    if isinstance(arg_val, bool):
+        # Booleans are switches: emit the bare flag only when enabled, so
+        # a False value never turns the option on.
+        return [flag] if arg_val else []
+
+    return [flag, str(arg_val)]
diff --git a/q2_moshpit/eggnog/tests/test_method.py b/q2_moshpit/eggnog/tests/test_method.py
@@ -8,16 +8,20 @@
 
 import pandas as pd
 import pandas.testing as pdt
+import os
 from unittest.mock import patch
 import qiime2
 from qiime2.plugin.testing import TestPluginBase
-
 from q2_types_genomics.feature_data import MAGSequencesDirFmt
-from .._method import eggnog_diamond_search, eggnog_annotate, fetch_eggnog_db
+from .._method import (
+  eggnog_diamond_search, eggnog_annotate, fetch_eggnog_db,
+  build_custom_diamond_db
+)
 from q2_types_genomics.reference_db import (
-    DiamondDatabaseDirFmt, EggnogRefDirFmt)
+    DiamondDatabaseDirFmt, EggnogRefDirFmt, NCBITaxonomyDirFmt)
 from q2_types_genomics.per_sample_data import ContigSequencesDirFmt
 from q2_types_genomics.genome_data import SeedOrthologDirFmt, OrthologFileFmt
+from q2_types.feature_data import ProteinSequencesDirectoryFormat
 
 
 class TestDiamond(TestPluginBase):
@@ -87,6 +91,67 @@ def test_small_good_hits(self):
         pdt.assert_frame_equal(df, exp)
 
 
+class TestBuildDiamondDB(TestPluginBase):
+    package = 'q2_moshpit.eggnog.tests'
+
+    @patch("subprocess.run")
+    def test_build_custom_diamond_db_simple(self, subp_run):
+        # Without taxonomy data only the buffer size option is added.
+        # subprocess.run is patched, so DIAMOND itself is never executed
+        seqs = ProteinSequencesDirectoryFormat()
+
+        # Run the action with default parameters
+        built_db = build_custom_diamond_db(seqs)
+
+        # Locations the action is expected to hand over to DIAMOND
+        fasta_fp = os.path.join(str(seqs), "protein-sequences.fasta")
+        db_fp = os.path.join(str(built_db), "ref_db.dmnd")
+
+        # The external command must be issued exactly once, as below
+        expected_cmd = [
+            "diamond", "makedb",
+            "--in", fasta_fp,
+            "--db", db_fp,
+            "--file-buffer-size", "67108864",
+        ]
+
+        # Verify the single invocation of the patched subprocess.run
+        subp_run.assert_called_once_with(expected_cmd, check=True)
+
+    @patch("subprocess.run")
+    def test_build_custom_diamond_db_with_taxonomy(self, subp_run):
+        # Taxonomy data adds the taxonmap/taxonnodes/taxonnames options.
+        # subprocess.run is patched, so DIAMOND itself is never executed
+        seqs = ProteinSequencesDirectoryFormat()
+        taxonomy = NCBITaxonomyDirFmt()
+
+        # Run the action with taxonomy data supplied positionally
+        built_db = build_custom_diamond_db(seqs, taxonomy)
+
+        # Locations the action is expected to hand over to DIAMOND
+        fasta_fp = os.path.join(str(seqs), "protein-sequences.fasta")
+        db_fp = os.path.join(str(built_db), "ref_db.dmnd")
+        # Taxonomy files are expected inside the taxonomy directory
+        tax_dir = str(taxonomy)
+        map_fp = os.path.join(tax_dir, "prot.accession2taxid.gz")
+        nodes_fp = os.path.join(tax_dir, "nodes.dmp")
+        names_fp = os.path.join(tax_dir, "names.dmp")
+
+        # The external command must be issued exactly once, as below
+        expected_cmd = [
+            "diamond", "makedb",
+            "--in", fasta_fp,
+            "--db", db_fp,
+            "--file-buffer-size", "67108864",
+            "--taxonmap", map_fp,
+            "--taxonnodes", nodes_fp,
+            "--taxonnames", names_fp,
+        ]
+
+        # Verify the single invocation of the patched subprocess.run
+        subp_run.assert_called_once_with(expected_cmd, check=True)
+
+
 class TestFetchDB(TestPluginBase):
     package = 'q2_moshpit.eggnog.tests'
 

diff --git a/q2_moshpit/plugin_setup.py b/q2_moshpit/plugin_setup.py
@@ -8,7 +8,9 @@
 import importlib
 
 from q2_types.distance_matrix import DistanceMatrix
-from q2_types.feature_data import FeatureData, Sequence, Taxonomy
+from q2_types.feature_data import (
+    FeatureData, Sequence, Taxonomy, ProteinSequence
+)
 from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence
 from q2_types.per_sample_sequences import (
     SequencesWithQuality, PairedEndSequencesWithQuality
@@ -18,7 +20,7 @@
 from qiime2.core.type import Bool, Range, Int, Str, Float, List, Choices
 from qiime2.core.type import (Properties, TypeMap)
 from qiime2.plugin import (Plugin, Citations)
-
+import q2_moshpit._examples as ex
 import q2_moshpit
 from q2_types_genomics.feature_data import NOG, MAG
 from q2_types_genomics.genome_data import (
@@ -31,7 +33,9 @@
 from q2_types_genomics.kraken2._type import BrackenDB
 from q2_types_genomics.per_sample_data import MAGs, Contigs
 from q2_types_genomics.per_sample_data._type import AlignmentMap
-from q2_types_genomics.reference_db import ReferenceDB, Diamond, Eggnog
+from q2_types_genomics.reference_db import (
+    ReferenceDB, Diamond, Eggnog, TaxonomyNCBI
+)
 
 citations = Citations.load('citations.bib', package='q2_moshpit')
 
@@ -404,6 +408,55 @@
                 'analyses.'
 )
 
+# Register `build_custom_diamond_db` as a plugin method: protein sequences
+# (plus optional NCBI taxonomy data) in, a DIAMOND reference database out.
+plugin.methods.register_function(
+    function=q2_moshpit.eggnog.build_custom_diamond_db,
+    inputs={
+        'sequences': FeatureData[ProteinSequence],
+        'taxonomy_data': ReferenceDB[TaxonomyNCBI],
+    },
+    input_descriptions={
+        'sequences': "Artifact containing protein reference database file "
+                     "in FASTA format.",
+        'taxonomy_data': "Artifact containing taxonomy data. "
+                         "Needed in order to provide taxonomy features. "
+                         "Can be generated through name_of_action."
+                         # TODO: update action name here
+    },
+    outputs=[('diamond_db', ReferenceDB[Diamond])],
+    output_descriptions={
+        'diamond_db': "Artifact containing a binary DIAMOND database file."
+    },
+    parameters={
+        "threads": Int % Range(1, None),
+        "verbose": Bool,
+        "log": Bool,
+        "file_buffer_size": Int % Range(1, None),
+        "ignore_warnings": Bool,
+        "no_parse_seqids": Bool
+    },
+    parameter_descriptions={
+        "threads": "Number of CPU threads. By default, the program will "
+                   "auto-detect and use all available virtual cores on the "
+                   "machine.",
+        "verbose": "Enable more verbose terminal output.",
+        "log": "Enable even more verbose terminal output, which is also "
+               "written to a file named diamond.log in the current working "
+               "directory.",
+        "file_buffer_size": "file buffer size in bytes (default=67108864)",
+        "ignore_warnings": "Ignore warnings",
+        "no_parse_seqids": "Print raw seqids without parsing"
+    },
+    name="Create a DIAMOND formatted reference database from a FASTA input "
+         "file.",
+    description="Creates an artifact containing a binary DIAMOND database "
+                "file (ref_db.dmnd) from a protein reference database "
+                "file in FASTA format.",
+    citations=[citations["buchfink_sensitive_2021"]],
+    examples={"Minimum working example": ex.diamond_makedb},
+)
+
 plugin.methods.register_function(
     function=q2_moshpit.eggnog.fetch_eggnog_db,
     inputs={},