[FEATURE] Profiling for single and paired end

- code - tests - documentation - examples
motu-tool · Nov 5, 2022 · cd85ac7 · cd85ac7
1 parent 54a4706
commit cd85ac7
Show file tree

Hide file tree

Showing 28 changed files with 79,490 additions and 18 deletions.
diff --git a/.gitignore b/.gitignore
@@ -127,3 +127,5 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+q2_motus/tests/data/dev.ipynb
diff --git a/.travis.yaml b/.travis.yaml
@@ -0,0 +1,24 @@
+dist: trusty
+sudo: false
+language: python
+before_install:
+  - export MPLBACKEND='Agg'
+  - wget -q https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+  - export MINICONDA_PREFIX="$HOME/miniconda"
+  - bash miniconda.sh -b -p $MINICONDA_PREFIX
+  - export PATH="$MINICONDA_PREFIX/bin:$PATH"
+  - conda config --set always_yes yes
+  - conda update -q conda
+  - conda info -a
+install:
+  - wget -q https://raw.githubusercontent.com/qiime2/environment-files/master/latest/staging/qiime2-latest-py36-linux-conda.yml
+  - conda env create -q -n test-env --file qiime2-latest-py36-linux-conda.yml
+  - source activate test-env
+  - conda install -q pytest-cov
+  - pip install flake8 coveralls
+  - make install
+script:
+  - make lint
+  - travis_wait make test-cov
+after_success:
+  - coveralls
diff --git a/Makefile b/Makefile
@@ -14,6 +14,9 @@ test-cov: all
 	py.test --cov=q2_motus
 
 install:
+	which motus  || pip install motu-profiler
+	which bwa || git clone https://github.com/lh3/bwa.git || cd bwa || make || cd .. || rm -rf bwa
+	motus downloadDB
 	$(PYTHON) setup.py install
 
 dev: all

diff --git a/README.md b/README.md
@@ -1 +1,42 @@
-# q2-mOTUs
+# q2-mOTUs
+This is a QIIME 2 wrapper for (mOTU-tool)[https://motu-tool.org/] . For details on QIIME 2, see https://qiime2.org. The tool will help you to assign taxonomy to your metagenomic samples
+
+# Requirements
+- QIIME 2 >= 2022.8 (https://qiime2.org/)
+- Git
+# Installation
+## 1. Install QIIME 2
+Follow the instructions on https://docs.qiime2.org/2022.8/install/native/ to install QIIME 2. You will need to install the latest version of QIIME 2 (2022.8 or later).
+## 2. Activate QIIME 2
+Activate the QIIME 2 environment by running the following command:
+```
+conda activate qiime2-2022.8
+```
+## 3. Install mOTU-tool
+```
+git clone https://github.com/motu-tool/q2-mOTUs
+cd q2-mOTUs
+make install
+```
+
+# Usage
+## 1. Import your data to QIIME 2
+Import your metagenomic sequencing data in `.fastq` format (don't forget to preprocess your data) to QIIME2 as a `SampleData` semantic type using manifest file. See examples in `q2_motus/test/data`.
+## 2. Run mOTU-tool
+Whether you have a single sample or multiple samples, you can run mOTU-tool using the following command:
+```
+qiime motus classify \
+    --i-samples paired-end.qza \
+    --o-table paired-end-classified.qza \
+    --p-threads 4
+```
+## 3. Process the results
+Use `qiime2` tools to visualize the results downstream, for example use:
+```
+qiime feature-table summarize \
+    --i-table paired-end-classified.qza \
+    --o-visualization paired-end-classified.qzv
+```
+To get summary of your feature table.
+
+![image](expected_output/table-summary.png)
diff --git a/example_output/paired-end-classified.qza b/example_output/paired-end-classified.qza
diff --git a/example_output/paired-end-classified.qzv b/example_output/paired-end-classified.qzv
diff --git a/example_output/table-summary.png b/example_output/table-summary.png
diff --git a/q2_motus/__init__.py b/q2_motus/__init__.py
@@ -1,7 +1,7 @@
-from ._hello import print_hello
+from ._taxonomy import classify
 from ._version import get_versions
 
 __version__ = get_versions()['version']
 del get_versions
 
-__all__ = ['print_hello']
+__all__ = ['classify']
diff --git a/q2_motus/_hello.py b/q2_motus/_hello.py
diff --git a/q2_motus/_taxonomy.py b/q2_motus/_taxonomy.py
@@ -0,0 +1,147 @@
+import os
+import subprocess
+import tempfile
+from functools import partial
+from typing import List, NamedTuple, Union, Literal
+
+import numpy as np
+import pandas as pd
+from q2_types.per_sample_sequences import (
+    SingleLanePerSamplePairedEndFastqDirFmt,
+    SingleLanePerSampleSingleEndFastqDirFmt)
+
+
+def _run_command(cmd: List[str], verbose: bool = False) -> None:
+    """Run a command in a subprocess.
+
+    Parameters
+    ----------
+    cmd : str
+        The command to run.
+    verbose : bool, optional
+        Whether to print the command before running it, by default False.
+
+    Raises
+    ------
+    subprocess.CalledProcessError
+        If the command returns a non-zero exit status.
+    """
+    if verbose:
+        print(cmd)
+    subprocess.run(cmd, check=True)
+
+
+def preprocess_manifest(data: Union[SingleLanePerSampleSingleEndFastqDirFmt,
+                                    SingleLanePerSamplePairedEndFastqDirFmt]) -> pd.DataFrame:
+
+    """Extracts sample names and filepaths from manifest file."""
+
+    manifest = pd.read_csv(os.path.join(str(data), data.manifest.pathspec),     
+                           header=0, comment='#')
+
+    manifest.filename = manifest.filename.apply(lambda x: os.path.join(str(data), x))
+    id_to_fps = manifest.pivot(index='sample-id', columns='direction', values='filename')
+
+    return id_to_fps
+
+
+def profile_sample(
+    output_directory: str, 
+    threads: int, 
+    min_alen: int,
+    marker_gene_cutoff: int, 
+    mode: Literal["base.coverage", "insert.raw_counts", "insert.scaled_counts"],
+    reference_genomes: bool,
+    ncbi_taxonomy: bool,
+    full_taxonomy: bool,
+    taxonomy_level: Literal["mOTU", "genus", "family", "order", "class", "phylum", "kingdom"],
+    row: NamedTuple) -> List[str]: 
+
+    """Run motu-profiler on a single sample."""
+
+    sample_name = row.Index
+    reads = row[1:]
+
+    cmd = ["motus", "profile"]
+
+    # single-end reads
+    if len(reads) == 1:
+        cmd.extend(["-s", reads[0]])
+    # paired-end reads
+    elif len(reads) == 2:
+        cmd.extend(["-f", reads[0], "-r", reads[1]])
+
+    cmd.extend(["-n", sample_name, 
+                "-o", os.path.join(output_directory, sample_name + ".motus"), 
+                "-t", str(threads),
+                "-l", str(min_alen),
+                "-g", str(marker_gene_cutoff),
+                "-y", str(mode),
+                "-k", str(taxonomy_level),
+                "-c"])
+
+    if reference_genomes:
+        cmd.extend(["-e"])
+
+    if ncbi_taxonomy:
+        cmd.extend(["-p"])
+
+    if full_taxonomy:
+        cmd.extend(["-q"])
+
+    p = _run_command(cmd)
+
+    return cmd
+
+
+def load_table(tab_fp: str) -> pd.DataFrame:
+    '''Converts merged mOTU table to biom feature table'''
+    df = pd.read_csv(tab_fp, sep='\t', index_col=0, skiprows=2)
+    df = df.replace({0 : np.nan})
+    # dropping unassigned species
+    tab = df.dropna(axis = 0, thresh = df.shape[1] - 1).fillna(0).T
+    return tab
+
+
+def classify(
+    samples: Union[SingleLanePerSamplePairedEndFastqDirFmt, 
+                   SingleLanePerSampleSingleEndFastqDirFmt],
+    threads: int, 
+    min_alen: int = 75, 
+    marker_gene_cutoff: int = 3,
+    mode: Literal["base.coverage", "insert.raw_counts", "insert.scaled_counts"] = "insert.scaled_counts",
+    reference_genomes: bool = False,
+    ncbi_taxonomy: bool = False,
+    full_taxonomy: bool = False,
+    taxonomy_level: Literal["mOTU", "genus", "family", "order", "class", "phylum", "kingdom"] = "mOTU",
+    ) -> pd.DataFrame:
+    """Run motu-profiler on paired-end samples data.
+
+    Parameters
+    ----------
+    sequencing_data : PairEndSequencesWithQuality
+        The paired-end sequencing data to be profiled.
+    threads : int
+        Number of threads to use.
+    """
+
+    id_to_fps = preprocess_manifest(samples)
+    # Create temporary directory
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        profile_dir = os.path.join(temp_dir, "profiles")
+        # consistent output and number of threads
+        profile_sample_partial = partial(profile_sample, profile_dir, threads, min_alen, marker_gene_cutoff, mode,
+                                         reference_genomes, ncbi_taxonomy, full_taxonomy, taxonomy_level)
+        # iterate over samples
+        for row in id_to_fps.itertuples():
+            os.makedirs(profile_dir, exist_ok=True)
+            profile_sample_partial(row)
+
+        # merge profiles
+        taxatable = os.path.join(temp_dir, "motus.merged")
+        cmd = ["motus", "merge", "-d", profile_dir, "-o", taxatable]
+        _run_command(cmd)
+
+        # output merged profiles as feature table
+        return load_table(taxatable)
diff --git a/q2_motus/plugin_setup.py b/q2_motus/plugin_setup.py
@@ -1,28 +1,51 @@
 import q2_motus
 
-from qiime2.plugin import (Int, Str, Float, Plugin, Citations, Metadata,
-                           Visualization)
+from qiime2.plugin import (Int, Range, Str, Bool, Choices, Plugin, Citations) 
 
-from q2_motus._hello import print_hello
+from q2_motus._taxonomy import classify
+from q2_types.sample_data import SampleData
+from q2_types.per_sample_sequences import SequencesWithQuality, PairedEndSequencesWithQuality
+from q2_types.feature_table import FeatureTable, Frequency
+
+citations = Citations.load('citations.bib', package='q2_motus')
 
 plugin = Plugin(
     name='motus',
     version=q2_motus.__version__,
     website="https://github.com/motu-tool/q2-motus",
     package='q2_motus',
-    citations=Citations.load('citations.bib', package='q2_motus'),
+    citations=[citations["Milanese2019-gw"]],
     description=('This QIIME 2 plugin allows taxonomical profiling of '
                  'metagenomic samples using mOTU tool.'),
     short_description='Taxonomical profiling of metagenomic samples using mOTU.'
 )
 
 
-plugin.visualizers.register_function(
-    function=print_hello,
-    inputs={},
-    parameters={},
-    input_descriptions={},
-    parameter_descriptions={},
-    name='Print hello.',
-    description='Print hello on the screen.'
-)
+plugin.methods.register_function(
+    function=classify,
+    inputs={"samples": SampleData[SequencesWithQuality | PairedEndSequencesWithQuality]},
+    outputs=[("table", FeatureTable[Frequency]), 
+             ("taxonomy", FeatureData[Taxonomy])],
+    parameters={"threads": Int % Range(1, None),
+                "min_alen": Int % Range(1, None),
+                "marker_gene_cutoff": Int % Range(1, 10),
+                "mode": Str % Choices(["base.coverage", "insert.raw_counts", "insert.scaled_counts"]),
+                "reference_genomes": Bool,
+                "ncbi_taxonomy": Bool,
+                "full_taxonomy": Bool,
+                "taxonomy_level": Str % Choices(["mOTU", "genus", "family", "order", "class", "phylum", "kingdom"])},
+    name="mOTU paired end profiler",
+    description="Executes a taxonomical classification of paired-end sample.",
+    input_descriptions={"samples": "The paired-end samples to be classified."},
+    parameter_descriptions={"threads": "The number of threads to use.",
+                            "min_alen": "Minimum alignment length.",
+                            "marker_gene_cutoff": "Minimum number of marker genes to be considered a species." 
+                                                  "Ranges from 1 to 10. A higher value increases precision (and lowers recall)",
+                            "mode": "The mode to use for abundance estimation." 
+                                    "base.coverage measures the average base coverage of the gene."
+                                    "insert.raw_counts measures the number of reads that map to the gene."
+                                    "insert.scaled_counts measures the number of reads that map to the gene, scaled by the length of the gene.",
+                            "reference_genomes": "Use only species with reference genomes (ref-mOTUs).",
+                            "ncbi_taxonomy": "Use NCBI taxonomy instead of mOTU.",
+                            "full_taxonomy": "Use full taxonomy.",
+                            "taxonomy_level": "The taxonomy level to use for profiling."})
diff --git a/q2_motus/tests/__init__.py b/q2_motus/tests/__init__.py
Original file line number	Diff line number	Diff line change
Expand Up		@@ -127,3 +127,5 @@ dmypy.json

		# Pyre type checker
		.pyre/

		q2_motus/tests/data/dev.ipynb