Skip to content

Commit

Permalink
ENH: add action to build custom Diamond database (#103)
Browse files Browse the repository at this point in the history
* ENH: Build Diamond DB from fasta action

* Added parameter compatibility

* Linting corrections

* Added parameters and optional input to build_diamond_db

* Added example and citations for build_diamond_db

* Added compatibility with optional input

* Taxonomy typed run enabled

* Add test for run with taxon information

* Update action name to build_custom_diamond_db

* Refactor TaxonomyNCBI to NCBITaxonomy

* remove file entry from citations

* remove pdb from eggnog._methods

* eggnog._method.py: moving relative imports to end of section

* eggnog.tests.test_method.py: reorganize imports

* refactor sequences to seqs

* Refactoring seqs and method name. further updates

* Refactor taxonomy_data to taxonomy

* plugin_setup.py: Update `seqs` input description.

* plugin_setup.py: update to `taxonomy` input_description

* Another update to the description in the function registration

* Set default threads to 1

* Another update to parameter description in the function registration

* extend command inside

* write in/out paths directly in command

* Move the db related code to its own files.

New files for dbs

New files for dbs in eggnog. Further adjustments.

* Adjust paths from imports since change of db related code

* set --log always to true

* Update q2_moshpit/plugin_setup.py

Co-authored-by: Michal Ziemski <mziemski@ethz.ch>

---------

Co-authored-by: Michal Ziemski <mziemski@ethz.ch>
  • Loading branch information
Sann5 and misialq authored Jan 15, 2024
1 parent f31d81a commit 492dc37
Show file tree
Hide file tree
Showing 9 changed files with 247 additions and 15 deletions.
25 changes: 25 additions & 0 deletions q2_moshpit/_examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

# URL of the FASTA file that the usage example in this module downloads as
# its input. The literals are wrapped in parentheses so that they are joined
# into a single string: with only a backslash after `=`, the second literal
# is not part of the assignment and the URL ends up truncated.
url = (
    'https://scop.berkeley.edu/downloads/scopeseq-2.07/astral-scopedom-seqres'
    '-gd-sel-gs-bib-40-2.07.fa'
)


def diamond_makedb(use):
    """Usage example: build a DIAMOND database from a downloaded FASTA file.

    Args:
        use: Usage driver object; it supplies `init_artifact_from_url`,
            `UsageAction`, `UsageInputs`, `UsageOutputNames` and `action`.
    """
    # Register the input artifact, fetched from the module-level `url`
    seqs_artifact = use.init_artifact_from_url('sequences', url)

    # Describe the call piece by piece, then hand everything to the driver
    target_action = use.UsageAction('moshpit', 'build_custom_diamond_db')
    action_inputs = use.UsageInputs(seqs=seqs_artifact)
    output_names = use.UsageOutputNames(diamond_db='diamond_db')

    # The value returned by `use.action` is not needed by this example
    use.action(target_action, action_inputs, output_names)
6 changes: 3 additions & 3 deletions q2_moshpit/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
}


def run_command(cmd, env=None, verbose=True, pipe=False):
def run_command(cmd, env=None, verbose=True, pipe=False, **kwargs):
if verbose:
print("Running external command line application(s). This may print "
"messages to stdout and/or stderr.")
Expand All @@ -47,9 +47,9 @@ def run_command(cmd, env=None, verbose=True, pipe=False):
return result

if env:
subprocess.run(cmd, env=env, check=True)
subprocess.run(cmd, env=env, check=True, **kwargs)
else:
subprocess.run(cmd, check=True)
subprocess.run(cmd, check=True, **kwargs)


def _construct_param(arg_name):
Expand Down
20 changes: 20 additions & 0 deletions q2_moshpit/citations.bib
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,23 @@ @article{hyatt_prodigal_2010
pages = {119},
}

@article{buchfink_sensitive_2021,
title = {Sensitive protein alignments at tree-of-life scale using {DIAMOND}},
volume = {18},
copyright = {2021 The Author(s)},
issn = {1548-7105},
url = {https://www.nature.com/articles/s41592-021-01101-x},
doi = {10.1038/s41592-021-01101-x},
abstract = {We are at the beginning of a genomic revolution in which all known species are planned to be sequenced. Accessing such data for comparative analyses is crucial in this new age of data-driven biology. Here, we introduce an improved version of DIAMOND that greatly exceeds previous search performances and harnesses supercomputing to perform tree-of-life scale protein alignments in hours, while matching the sensitivity of the gold standard BLASTP.},
language = {en},
number = {4},
urldate = {2023-12-01},
journal = {Nature Methods},
author = {Buchfink, Benjamin and Reuter, Klaus and Drost, Hajk-Georg},
month = apr,
year = {2021},
note = {Number: 4},
publisher = {{Nature Publishing Group}},
keywords = {Computational biology and bioinformatics, Genome informatics, Genomic analysis, Sequencing, Software},
pages = {366--368},
}
7 changes: 3 additions & 4 deletions q2_moshpit/eggnog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------


from ._method import eggnog_diamond_search, eggnog_annotate
from ._dbs import fetch_eggnog_db, fetch_diamond_db
from ._dbs import fetch_eggnog_db, build_custom_diamond_db, fetch_diamond_db


__all__ = [
'eggnog_diamond_search', 'eggnog_annotate', 'fetch_eggnog_db',
'fetch_diamond_db'
'fetch_diamond_db', 'build_custom_diamond_db'
]
54 changes: 52 additions & 2 deletions q2_moshpit/eggnog/_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import os
from .._utils import run_command, colorify
from q2_types.feature_data import ProteinSequencesDirectoryFormat
from q2_types_genomics.reference_db import (
EggnogRefDirFmt, DiamondDatabaseDirFmt
EggnogRefDirFmt, DiamondDatabaseDirFmt, NCBITaxonomyDirFmt
)
from .._utils import run_command, _process_common_input_params, colorify
from ._utils import _parse_build_diamond_db_params


def fetch_eggnog_db() -> EggnogRefDirFmt:
Expand Down Expand Up @@ -37,6 +39,54 @@ def fetch_eggnog_db() -> EggnogRefDirFmt:
return eggnog_db


def build_custom_diamond_db(
        seqs: ProteinSequencesDirectoryFormat,
        taxonomy: NCBITaxonomyDirFmt = None,
        threads: int = 1,
        file_buffer_size: int = 67108864,
        ignore_warnings: bool = False,
        no_parse_seqids: bool = False
) -> DiamondDatabaseDirFmt:
    """
    Run `diamond makedb` on the `protein-sequences.fasta` file found in
    `seqs` and return the directory format that holds the resulting
    `ref_db.dmnd` file.

    When `taxonomy` is given, the paths to its `prot.accession2taxid.gz`,
    `nodes.dmp` and `names.dmp` files are passed to the command as well.
    All remaining parameters are forwarded as command line options.
    """
    # Options forwarded to the command line, in signature order
    # (the two inputs, `seqs` and `taxonomy`, are handled separately)
    makedb_params = {
        "threads": threads,
        "file_buffer_size": file_buffer_size,
        "ignore_warnings": ignore_warnings,
        "no_parse_seqids": no_parse_seqids,
    }

    # Point diamond at the taxonomy files only when they were provided
    if taxonomy is not None:
        tax_dir = str(taxonomy)
        makedb_params.update(
            taxonmap=os.path.join(tax_dir, "prot.accession2taxid.gz"),
            taxonnodes=os.path.join(tax_dir, "nodes.dmp"),
            taxonnames=os.path.join(tax_dir, "names.dmp"),
        )

    # Turn the collected options into command line tokens
    extra_args = _process_common_input_params(
        processing_func=_parse_build_diamond_db_params, params=makedb_params
    )

    # The output directory must exist before its path goes into the command
    diamond_db = DiamondDatabaseDirFmt()
    fasta_fp = os.path.join(str(seqs), 'protein-sequences.fasta')
    db_fp = os.path.join(str(diamond_db), 'ref_db.dmnd')

    run_command([
        "diamond", "makedb",
        "--verbose", "--log",
        "--in", fasta_fp,
        "--db", db_fp,
        *extra_args
    ])

    return diamond_db


def fetch_diamond_db() -> DiamondDatabaseDirFmt:
"""
Downloads diamond reference database using the
Expand Down
28 changes: 28 additions & 0 deletions q2_moshpit/eggnog/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2022, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from typing import List


def _parse_build_diamond_db_params(arg_key, arg_val) -> List[str]:
"""Creates a list with argument and its value to be consumed by
the `diamond makedb` command.
Args:
arg_key (str): Argument name.
arg_val: Argument value.
Returns:
[converted_arg, arg_value]: List containing a prepared command line
parameter and, optionally, its value.
"""
# Change "_" in arg_key for "-"
arg_key = arg_key.replace("_", "-")

if isinstance(arg_val, bool):
return [f"--{arg_key}"]
else:
return [f"--{arg_key}", str(arg_val)]
69 changes: 68 additions & 1 deletion q2_moshpit/eggnog/tests/test_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
import os
from unittest.mock import patch, call
from qiime2.plugin.testing import TestPluginBase
from .._dbs import fetch_eggnog_db, fetch_diamond_db
from q2_types.feature_data import ProteinSequencesDirectoryFormat
from q2_types_genomics.reference_db import NCBITaxonomyDirFmt
from .._dbs import fetch_eggnog_db, build_custom_diamond_db, fetch_diamond_db


class TestFetchDB(TestPluginBase):
Expand All @@ -27,6 +29,71 @@ def test_fetch_eggnog_db(self, subp_run):
]
subp_run.assert_called_once_with(cmd, check=True)


class TestBuildDiamondDB(TestPluginBase):
package = 'q2_moshpit.eggnog.tests'

@patch("subprocess.run")
def test_build_custom_diamond_db_simple(self, subp_run):
    # Only the required input is given; all parameters keep their defaults
    seqs = ProteinSequencesDirectoryFormat()

    # subprocess.run is patched, so no external command is actually run
    db = build_custom_diamond_db(seqs)

    # Defaults contribute only --threads and --file-buffer-size; the two
    # boolean parameters default to False and add nothing
    expected_cmd = [
        "diamond", "makedb",
        "--verbose", "--log",
        "--in", os.path.join(str(seqs), "protein-sequences.fasta"),
        "--db", os.path.join(str(db), "ref_db.dmnd"),
        "--threads", "1",
        "--file-buffer-size", "67108864",
    ]

    # Exactly one call with the expected command is required
    subp_run.assert_called_once_with(expected_cmd, check=True)

@patch("subprocess.run")
def test_build_custom_diamond_db_with_taxonomy(self, subp_run):
    # Both the sequences and the optional taxonomy input are given
    seqs = ProteinSequencesDirectoryFormat()
    taxonomy = NCBITaxonomyDirFmt()

    # subprocess.run is patched, so no external command is actually run
    db = build_custom_diamond_db(seqs, taxonomy)

    # The taxonomy files are all looked up inside the taxonomy directory
    tax_dir = str(taxonomy)

    # Same as the default command, followed by the three taxonomy options
    expected_cmd = [
        "diamond", "makedb",
        "--verbose", "--log",
        "--in", os.path.join(str(seqs), "protein-sequences.fasta"),
        "--db", os.path.join(str(db), "ref_db.dmnd"),
        "--threads", "1",
        "--file-buffer-size", "67108864",
        "--taxonmap", os.path.join(tax_dir, "prot.accession2taxid.gz"),
        "--taxonnodes", os.path.join(tax_dir, "nodes.dmp"),
        "--taxonnames", os.path.join(tax_dir, "names.dmp"),
    ]

    # Exactly one call with the expected command is required
    subp_run.assert_called_once_with(expected_cmd, check=True)

@patch("subprocess.run")
def test_fetch_diamond_db(self, subp_run):
# Call function. Patching will make sure nothing is
Expand Down
5 changes: 3 additions & 2 deletions q2_moshpit/eggnog/tests/test_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
import qiime2
import pandas as pd
import pandas.testing as pdt
import qiime2
from qiime2.plugin.testing import TestPluginBase
from q2_types_genomics.feature_data import MAGSequencesDirFmt
from .._method import eggnog_diamond_search, eggnog_annotate
from q2_types_genomics.reference_db import (
DiamondDatabaseDirFmt, EggnogRefDirFmt)
DiamondDatabaseDirFmt, EggnogRefDirFmt
)
from q2_types_genomics.per_sample_data import ContigSequencesDirFmt
from q2_types_genomics.genome_data import SeedOrthologDirFmt, OrthologFileFmt

Expand Down
48 changes: 45 additions & 3 deletions q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
import importlib

from q2_types.distance_matrix import DistanceMatrix
from q2_types.feature_data import FeatureData, Sequence, Taxonomy
from q2_types.feature_data import (
FeatureData, Sequence, Taxonomy, ProteinSequence
)
from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence
from q2_types.per_sample_sequences import (
SequencesWithQuality, PairedEndSequencesWithQuality
Expand All @@ -18,7 +20,7 @@
from qiime2.core.type import Bool, Range, Int, Str, Float, List, Choices
from qiime2.core.type import (Properties, TypeMap)
from qiime2.plugin import (Plugin, Citations)

import q2_moshpit._examples as ex
import q2_moshpit
from q2_types_genomics.feature_data import NOG, MAG
from q2_types_genomics.genome_data import (
Expand All @@ -31,7 +33,9 @@
from q2_types_genomics.kraken2._type import BrackenDB
from q2_types_genomics.per_sample_data import MAGs, Contigs
from q2_types_genomics.per_sample_data._type import AlignmentMap
from q2_types_genomics.reference_db import ReferenceDB, Diamond, Eggnog
from q2_types_genomics.reference_db import (
ReferenceDB, Diamond, Eggnog, NCBITaxonomy
)

citations = Citations.load('citations.bib', package='q2_moshpit')

Expand Down Expand Up @@ -485,6 +489,44 @@
'analyses.'
)

# Register `build_custom_diamond_db` as a method of the plugin.
plugin.methods.register_function(
    function=q2_moshpit.eggnog.build_custom_diamond_db,
    name=(
        "Create a DIAMOND formatted reference database from a FASTA "
        "input file."
    ),
    description=(
        "Creates an artifact containing a binary DIAMOND database file "
        "(ref_db.dmnd) from a protein reference database file in FASTA "
        "format."
    ),
    citations=[citations["buchfink_sensitive_2021"]],
    # Inputs: `taxonomy` defaults to None in the function signature
    inputs={
        "seqs": FeatureData[ProteinSequence],
        "taxonomy": ReferenceDB[NCBITaxonomy],
    },
    input_descriptions={
        "seqs": "Protein reference database.",
        "taxonomy": (
            "Reference taxonomy, needed to provide "
            "taxonomy features."
        ),
    },
    # Parameters: both integers are restricted to values of 1 or more
    parameters={
        "threads": Int % Range(1, None),
        "file_buffer_size": Int % Range(1, None),
        "ignore_warnings": Bool,
        "no_parse_seqids": Bool,
    },
    parameter_descriptions={
        "threads": "Number of CPU threads.",
        "file_buffer_size": "File buffer size in bytes.",
        "ignore_warnings": "Ignore warnings.",
        "no_parse_seqids": "Print raw seqids without parsing.",
    },
    outputs=[("diamond_db", ReferenceDB[Diamond])],
    output_descriptions={
        "diamond_db": "DIAMOND database.",
    },
    examples={
        "Minimum working example": ex.diamond_makedb,
    },
)

plugin.methods.register_function(
function=q2_moshpit.eggnog.fetch_eggnog_db,
inputs={},
Expand Down

0 comments on commit 492dc37

Please sign in to comment.