bgruening · tcollins2011 · Oct 8, 2024 · Oct 9, 2024 · Oct 9, 2024 · Oct 9, 2024
diff --git a/tools/sylph/.shed.yml b/tools/sylph/.shed.yml
@@ -0,0 +1,13 @@
+name: sylph
+owner: bgruening
+description: sylph - fast and precise species-level metagenomic profiling with ANIs
+long_description: sylph is a program that performs ultrafast (1) ANI querying or (2) metagenomic profiling for metagenomic shotgun samples.
+homepage_url: https://github.com/bluenote-1577/sylph
+# Must be a browsable GitHub directory URL, which requires the tree/<branch> segment.
+remote_repository_url: https://github.com/bgruening/galaxytools/tree/master/tools/sylph
+categories:
+  - Metagenomics
+type: unrestricted
+# One Tool Shed repository is generated per tool XML using the templates below.
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "{{ tool_name }} from the sylph suite"
+
diff --git a/tools/sylph/README.md b/tools/sylph/README.md
@@ -0,0 +1,10 @@
+For Galaxy admins and local runs:
+
+The databases for sylph have associated metadata files. Each metadata file MUST be paired with its matching database for the tool to produce correct output. The easiest places to download databases and metadata files are:
+For databases: https://github.com/bluenote-1577/sylph/wiki/Pre%E2%80%90built-databases
+For metadata: https://github.com/bluenote-1577/sylph-utils
+
+The tool assumes that the directory referenced by the data table has the following layout:
+<name_of_organism>
+    - database.syldb
+    - metadata.tsv.gz
diff --git a/tools/sylph/macros.xml b/tools/sylph/macros.xml
@@ -0,0 +1,148 @@
+<macros>
+    <!-- Upstream sylph version; used in the tool version string and to pin the sylph package requirement -->
+    <token name="@TOOL_VERSION@">0.6.1</token>
+    <!-- Wrapper revision; appended to the tool version after "+galaxy" -->
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@LICENSE@">MIT</token>
+    <token name="@DB_SELECTOR@"><![CDATA[
+        ## Work out where the sylph database lives: inside the data table
+        ## directory for a cached database, or the dataset itself when it
+        ## comes from the history. It is then linked under a fixed name.
+        #if $database_select.select == 'cached':
+            #set $syldb_source = str($database_select.sylph_database.fields.path) + '/database.syldb'
+        #else:
+            #set $syldb_source = str($database_select.sylph_database)
+        #end if
+        ln -s '$syldb_source' 'database.syldb' &&
+    ]]></token>
+    <token name="@SINGLE_INPUT@"><![CDATA[
+        ## Links one single-end dataset into the working directory and leaves
+        ## the link name in the input variable for the sketch step.
+        ## fastqsanger datatypes are mapped to plain fastq suffixes.
+        #if $sketch.input.ext == 'fastqsanger':
+            #set $ext = 'fastq'
+        #else if $sketch.input.ext == 'fastqsanger.gz':
+            #set $ext = 'fastq.gz'
+        #else:
+            #set $ext = str($sketch.input.ext)
+        #end if
+        ## Append the suffix only when the identifier does not already carry it
+        #set $identifier = str($sketch.input.element_identifier)
+        #if not ($identifier.endswith('.fastq') or $identifier.endswith('.fastq.gz')):
+            #set $identifier = $identifier + '.' + $ext
+        #end if
+        ## Replace everything except word characters, dots and hyphens so that
+        ## quotes, parentheses or other shell metacharacters cannot reach the command line
+        #set $input = re.sub(r'[^\w.-]', '_', $identifier)
+        ln -s '$sketch.input' '$input' &&
+    ]]></token>
+    <token name="@SINGLE_GROUP@"><![CDATA[
+        ## Links every selected single-end dataset into the working directory
+        ## and collects the link names, separated by spaces, in the input variable.
+        #set $input = ''
+        #set $linked_names = []
+        #for $number, $current_file in enumerate($sketch.input):
+            ## fastqsanger datatypes are mapped to plain fastq suffixes
+            #if $current_file.ext == 'fastqsanger':
+                #set $ext = 'fastq'
+            #else if $current_file.ext == 'fastqsanger.gz':
+                #set $ext = 'fastq.gz'
+            #else:
+                #set $ext = str($current_file.ext)
+            #end if
+            ## Append the suffix only when the identifier does not already carry it
+            #set $identifier = str($current_file.element_identifier)
+            #if not ($identifier.endswith('.fastq') or $identifier.endswith('.fastq.gz')):
+                #set $identifier = $identifier + '.' + $ext
+            #end if
+            ## Keep only word characters, dots and hyphens: the list is passed
+            ## to the shell unquoted, so every name must be shell-safe
+            #set $current_input = re.sub(r'[^\w.-]', '_', $identifier)
+            ## Datasets may share a name; prefix the position so the second
+            ## symlink does not fail because the file already exists
+            #if $current_input in $linked_names:
+                #set $current_input = str($number) + '_' + $current_input
+            #end if
+            #silent $linked_names.append($current_input)
+            ln -s '${current_file}' '$current_input' &&
+            #set $input = $input + ' ' + $current_input
+        #end for
+    ]]></token>
+    <token name="@PAIRED@"><![CDATA[
+        ## Links the two mates of one paired-end sample into the working
+        ## directory; the link names end up in the read1 and read2 variables.
+        ## fastqsanger datatypes are mapped to plain fastq suffixes.
+        #if $sketch.input_1.ext == 'fastqsanger':
+            #set $ext_1 = 'fastq'
+        #else if $sketch.input_1.ext == 'fastqsanger.gz':
+            #set $ext_1 = 'fastq.gz'
+        #else:
+            #set $ext_1 = str($sketch.input_1.ext)
+        #end if
+
+        #if $sketch.input_2.ext == 'fastqsanger':
+            #set $ext_2 = 'fastq'
+        #else if $sketch.input_2.ext == 'fastqsanger.gz':
+            #set $ext_2 = 'fastq.gz'
+        #else:
+            #set $ext_2 = str($sketch.input_2.ext)
+        #end if
+
+        ## Append the suffix only when the identifier does not already carry it,
+        ## then keep only word characters, dots and hyphens so the names are shell-safe
+        #set $read1 = str($sketch.input_1.element_identifier)
+        #if not ($read1.endswith('.fastq') or $read1.endswith('.fastq.gz')):
+            #set $read1 = $read1 + '.' + $ext_1
+        #end if
+        #set $read1 = re.sub(r'[^\w.-]', '_', $read1)
+
+        #set $read2 = str($sketch.input_2.element_identifier)
+        #if not ($read2.endswith('.fastq') or $read2.endswith('.fastq.gz')):
+            #set $read2 = $read2 + '.' + $ext_2
+        #end if
+        #set $read2 = re.sub(r'[^\w.-]', '_', $read2)
+
+        ## Both mates may carry the same name; keep the second link from
+        ## failing because the file already exists
+        #if $read2 == $read1:
+            #set $read2 = 'r2_' + $read2
+        #end if
+        ln -s '$sketch.input_1' '$read1' &&
+        ln -s '$sketch.input_2' '$read2' &&
+    ]]></token>
+    <token name="@PAIRED_GROUP@"><![CDATA[
+        ## Links the forward and reverse datasets of a paired collection into
+        ## the working directory; the link names end up in read1 and read2.
+        ## fastqsanger datatypes are mapped to plain fastq suffixes.
+        #if $sketch.input.forward.ext == 'fastqsanger':
+            #set $ext_1 = 'fastq'
+        #else if $sketch.input.forward.ext == 'fastqsanger.gz':
+            #set $ext_1 = 'fastq.gz'
+        #else:
+            #set $ext_1 = str($sketch.input.forward.ext)
+        #end if
+
+        #if $sketch.input.reverse.ext == 'fastqsanger':
+            #set $ext_2 = 'fastq'
+        #else if $sketch.input.reverse.ext == 'fastqsanger.gz':
+            #set $ext_2 = 'fastq.gz'
+        #else:
+            #set $ext_2 = str($sketch.input.reverse.ext)
+        #end if
+
+        ## Both names derive from the collection identifier; the reverse mate
+        ## gets an _r2 marker. Only word characters, dots and hyphens are kept
+        ## so quotes or other shell metacharacters cannot reach the command line.
+        #set $pair_name = str($sketch.input.element_identifier)
+        #set $read1 = re.sub(r'[^\w.-]', '_', $pair_name + '.' + $ext_1)
+        #set $read2 = re.sub(r'[^\w.-]', '_', $pair_name + '_r2.' + $ext_2)
+        ln -s '$sketch.input.forward' '$read1' &&
+        ln -s '$sketch.input.reverse' '$read2' &&
+    ]]></token>
+    <!-- Runtime dependencies: sylph pinned to @TOOL_VERSION@, plus python and pandas. The profile command runs Python helper scripts from the tool directory; presumably those are what need pandas (confirm). -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">sylph</requirement>
+            <requirement type="package" version="3.11.9">python</requirement>
+            <requirement type="package" version="2.2.3">pandas</requirement>
+        </requirements>
+    </xml>
+    <!-- Descriptive metadata macros. sylph_profile.xml as shown in this diff expands "description" and "citation"; "creator" and "xrefs" are defined here but not expanded there. -->
+    <xml name="description">
+        <description>fast and precise species-level metagenomic profiling with ANIs</description>
+    </xml>
+    <xml name="citation">
+        <citations>
+            <citation type="doi">10.1038/s41587-024-02412-y</citation>
+        </citations>
+    </xml>
+    <xml name="creator">
+        <creator>
+            <organization name="University of Toronto" url="https://github.com/bluenote-1577/sylph"/>
+        </creator>
+    </xml>
+    <xml name="xrefs">
+        <xrefs>
+            <xref type="bio.tools">sylph</xref>
+        </xrefs>
+    </xml>
+    <!-- Database source selector: either an entry of the sylph_databases data table or a database dataset from the history, optionally accompanied by its metadata file. -->
+    <xml name="input_database">
+        <conditional name="database_select">
+            <param name="select" type="select" label="Choose the source for databases and metadata">
+                <option value="cached">Cached data</option>
+                <option value="history">History</option>
+            </param>
+            <when value="cached">
+                <param label="Select a sylph database" name="sylph_database" type="select">
+                    <options from_data_table="sylph_databases">
+                        <validator message="No Sylph databases are available" type="no_options" />
+                    </options>
+                </param>
+            </when>
+            <when value="history">
+                <param label="Select a history dataset" name="sylph_database" type="data" format="binary" />
+                <param label="Metadata file for metaphlan and krona outputs" name="metadata" type="data" format="tabular.gz" optional="true" help="The metadata file MUST be directly associated with the input database. For more information, view the help text of the tool."/>
+            </when>
+        </conditional>
+    </xml>
+    <!-- Optional converted outputs produced in addition to the sylph table; the selected values are checked by the command section and by the output filters. -->
+    <xml name="output_format">
+        <param label="Additional output formats" name="outputs" type="select" display="checkboxes" multiple="true" help="In addition to Sylph's tabular output, you may output a file converted to these formats">
+            <option value="metaphlan">Sylph's MetaPhlAn-like output</option>
+            <option value="krona">Krona compatible</option>
+        </param>
+    </xml>
+</macros>
diff --git a/tools/sylph/sylph_profile.xml b/tools/sylph/sylph_profile.xml
@@ -0,0 +1,210 @@
+<tool id="sylph_profile" name="sylph profile" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.0">
+    <expand macro='description'/>
+    <macros>
+       <import>macros.xml</import> 
+    </macros>
+    <expand macro='requirements'/>
+    <command detect_errors='exit_code'><![CDATA[
+#import re
+## Converted outputs need a metadata file, which is optional for history
+## databases. Stop with a clear message before any work is done instead of
+## failing later on a dangling symlink.
+#if $outputs and $database_select.select == 'history' and not $database_select.metadata:
+    echo 'A metadata file is required to create MetaPhlAn or Krona output from a history database.' >&2 && exit 1 &&
+#end if
+##SYMLINK SYLPH DB
+@DB_SELECTOR@
+##Single input
+        #if $sketch.type == 'single':
+            @SINGLE_INPUT@
+##Single group
+        #else if $sketch.type == 'single_group':
+            @SINGLE_GROUP@
+##Paired input
+        #else if $sketch.type == 'paired':
+            @PAIRED@
+##Paired  group
+        #else if $sketch.type == 'paired_group':
+            @PAIRED_GROUP@
+        #end if
+##SKETCHING
+        sylph sketch
+        #if $sketch.type == 'single':
+            '$input'
+        #else if $sketch.type == 'single_group':
+            ## the macro builds a space-separated list of names here, so it stays unquoted
+            -r $input
+        #else if $sketch.type == 'paired':
+            -1 '$read1'
+            -2 '$read2'
+        #else if $sketch.type == 'paired_group':
+            -1 '$read1'
+            -2 '$read2'
+        #end if
+        -t \${GALAXY_SLOTS:-4}
+        -d sylph_sketches &&
+##MAIN COMMAND
+        sylph profile
+        database.syldb
+        sylph_sketches/*.sylsp
+        #if $min_num_kmers:
+            --min-number-kmers ${min_num_kmers}
+        #end if
+        -t \${GALAXY_SLOTS:-4}
+        -o '$output'
+
+##OPTIONAL FORMAT CONVERSION
+## The Krona table is derived from the MetaPhlAn-style file, so the conversion
+## script runs whenever any additional output is requested.
+        #if $outputs:
+            ## NOTE(review): README.md names the cached metadata file metadata.tsv.gz,
+            ## while this link expects database.tsv.gz; confirm which one is correct
+            #if $database_select.select == 'cached':
+                && ln -s '$database_select.sylph_database.fields.path/database.tsv.gz' 'database.tsv.gz'
+            #else:
+                && ln -s '$database_select.metadata' 'database.tsv.gz'
+            #end if
+            && python '$__tool_directory__/sylph_to_taxprof.py' -s '$output' -m database.tsv.gz -o metaphlan_
+            #if 'krona' in $outputs:
+                ## NOTE(review): the glob may expand to several files when several
+                ## samples are profiled; confirm the script accepts that
+                && python '$__tool_directory__/sylphformatoutput.py' format_for_krona --metaphlan_output *.sylphmpa --krona_output krona.tsv
+                && mkdir krona_out && mv *krona.tsv krona_out
+            #end if
+            && mkdir metaphlan_out && mv *.sylphmpa metaphlan_out
+        #end if
+    ]]></command>
+    <inputs>
+        <conditional name="sketch">
+            <param name="type" type="select" label="Select the type of reads used">
+                <option value="single">Individual single-end reads</option>
+                <option value="single_group">Group of single-ended reads</option>
+                <option value="paired">One set of paired-end reads</option>
+                <option value="paired_group">Group of paired-ended reads</option>
+            </param>
+            <!-- Both the generic fastq/fastq.gz and the fastqsanger/fastqsanger.gz datatypes are accepted; the command section links each dataset under a file name ending in its mapped extension -->
+            <when value="single">
+                <param name="input" type="data" format="fastq,fastq.gz,fastqsanger,fastqsanger.gz" label="Single-end input reads"/>
+            </when>
+            <when value="single_group">
+                <param name="input" type="data" format="fastq,fastq.gz,fastqsanger,fastqsanger.gz" label="Single-end input reads" multiple="true"/>
+            </when>
+            <when value="paired">
+                <param name="input_1" type="data" format="fastq,fastq.gz,fastqsanger,fastqsanger.gz" label="Paired-end input reads 1"/>
+                <param name="input_2" type="data" format="fastq,fastq.gz,fastqsanger,fastqsanger.gz" label="Paired-end input reads 2"/>
+            </when>
+            <when value="paired_group">
+                <param name="input" type="data_collection" format="fastq,fastq.gz,fastqsanger,fastqsanger.gz" label="Paired-end input reads" collection_type="paired"/>
+            </when>
+        </conditional>
+        <param name="min_num_kmers" type="integer" min="1" value="50" label="Minimum number of k-mers for Sylph to output a result." help="States the minimum number of k-mers needed for sylph to output a result. This is (approximately) the contig length divided by -c. With default settings, --min-number-kmers 10 can work with contigs ~2500 bp. For smaller contigs, consider -c 100."/>
+        <expand macro="output_format"/>
+        <expand macro="input_database"/>
+    </inputs>
+    <outputs>
+        <!-- Primary sylph profile table; not filtered, so it is always declared -->
+        <data format="tabular" name="output" label="${tool.name} on ${on_string}"/>
+        <!-- Converted files are matched by their .sylphmpa suffix and typed explicitly as tabular, because "sylphmpa" is not a registered datatype extension that could be taken from the file name -->
+        <collection name="metaphlan_out" type="list" label="${tool.name} on ${on_string}: MetaPhlAn-style output">
+            <filter>outputs and 'metaphlan' in outputs</filter>
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.sylphmpa" format="tabular" directory="metaphlan_out/" />
+        </collection>
+        <!-- Krona tables are written as .tsv files, so name and datatype are taken from the file name -->
+        <collection name="krona_out" type="list" label="${tool.name} on ${on_string}: Krona-useable output">
+            <filter>outputs and 'krona' in outputs</filter>
+            <discover_datasets pattern="__name_and_ext__" directory="krona_out/" />
+        </collection>
+    </outputs>
+    <tests>
+        <!-- Test 1 Individual single-end input -->
+        <test expect_num_outputs="1">
+            <param name="sylph_database" value="sylph_db"/>
+            <conditional name="sketch">
+                <param name="type" value="single"/>
+                <param name="input" value="single_1.fastq.gz" ftype="fastq"/>
+            </conditional>
+            <output name="output" value="output_1.tabular"/>
+        </test>
+
+        <!--Test 2  Group of Single-end Inputs-->
+        <test expect_num_outputs="1">
+            <param name="sylph_database" value="sylph_db"/>
+            <conditional name="sketch">
+                <param name="type" value="single_group"/>
+                <param name="input" value="single_1.fastq.gz,single_2.fastq.gz" ftype="fastq"/>
+            </conditional>
+            <param name="min_num_kmers" value="49"/>
+            <output name="output" value="output_2.tabular" compare="sim_size"/>
+        </test>
+
+        <!-- Test 3 Paired-end reads -->
+        <test expect_num_outputs="1">
+            <param name="sylph_database" value="sylph_db"/>
+            <conditional name="sketch">
+                <param name="type" value="paired"/>
+                <param name="input_1" value="test R1.fq" ftype="fastq"/>
+                <param name="input_2" value="test R2.fq" ftype="fastqsanger"/>
+            </conditional>
+            <output name="output" value="output_3.tabular"/>
+        </test>
+
+        <!-- Test 4 Collection of Paired-end Reads -->
+        <test expect_num_outputs="1">
+            <param name="sylph_database" value="sylph_db"/>
+            <conditional name="sketch">
+                <param name="type" value="paired_group"/>
+                <param name="input">
+                    <collection type="paired" name="test">
+                        <element name="forward" ftype="fastq" value="test R1.fq"/>
+                        <element name="reverse" ftype="fastq" value="test R2.fq"/>
+                    </collection>
+                </param>
+            </conditional>
+            <output name="output" value="output_4.tabular"/>
+        </test>
+        <!-- Test 5 output format conversion -->
+        <test expect_num_outputs="3">
+            <param name="sylph_database" value="sylph_db"/>
+            <conditional name="sketch">
+                <param name="type" value="single"/>
+                <param name="input" value="single_1.fastq.gz" ftype="fastq"/>
+            </conditional>
+            <param name="outputs" value="metaphlan,krona"/>
+            <!-- With test data, output will be empty for krona tool so only check against metaphlan converter, but keep num_outputs-->
+            <output_collection name="metaphlan_out" type="list">
+                <element name="metaphlan_single_1.fastq.gz" value="test.sylphmpa"/>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What is sylph?**
+
+Sylph is an extremely fast and memory efficient program for profiling and searching metagenomic samples against databases. It is 10-100x faster than other popular software such as MetaPhlAn or Kraken and more memory efficient too. 
+
+**What can sylph do?**
+
+- Profile metagenomes: sylph can calculate the abundances of genomes in a sample using a reference database. This is the same type of output as Kraken or MetaPhlAn. 
+- Search genomes against metagenomes: sylph can check if a genome is contained in your sample (e.g. is this E. coli genome in my sample?).
+- ANI querying: sylph can estimate the containment average nucleotide identity (ANI) of a reference genome to the genomes in your sample.
+- Use custom reference databases: Eukaryotes, viruses, and any collections of fasta files are ok.
+- Long-reads are usable: sylph is primarily optimized for short-reads, but it can utilize nanopore or PacBio reads with high precision.
+- Calculate coverage: sylph can estimate the coverage (not just the abundance) of genomes in your database.
+
+`See here for more information on what sylph can and cannot do <https://github.com/bluenote-1577/sylph/wiki/Introduction:-what-is-sylph-and-how-does-it-work%3F>`_.
+
+----
+
+**Output**
+
+Sylph profile outputs a TSV (tab-separated values) file. Each row is one genome detected in the metagenome sample.
+    - Sample_file: the filename of the reads/sample.
+    - Genome_file: the filename of the detected genome.
+    - Taxonomic_abundance: normalized taxonomic abundance as a percentage. Coverage-normalized - same as MetaPhlAn abundance
+    - Sequence_abundance: normalized sequence abundance as a percentage. The "number of reads" assigned to each genome - same as Kraken abundance
+    - Adjusted_ANI: adjusted containment ANI estimate.
+        - If coverage adjustment is possible (cov is < 3x cov): returns coverage-adjusted ANI
+        - If coverage is too low/high: returns Naive_ANI (see below)
+    - Eff_cov/True_cov: an estimate of the effective, or if -u specified, the true coverage. Always a decimal number.
+    - ANI_5-95_percentile: [5%,95%] confidence intervals. Not always a decimal number.
+        - If coverage adjustment is possible: float-float e.g. 98.52-99.55
+        - If coverage is too low/high: NA-NA is given.
+    - Eff_lambda: estimate of the effective coverage parameter. Not always a decimal number.
+        - If coverage adjustment is possible: lambda estimate is given
+        - If coverage is too low/high: LOW or HIGH is output
+    - Lambda_5-95_percentile: [5%, 95%] confidence intervals for lambda. Same format rules as ANI_5-95_percentile.
+    - Median_cov: median k-mer multiplicity for k-mers with >= 1 multiplicity.
+    - Mean_cov_geq1: mean k-mer multiplicity for k-mers with >= 1 multiplicity.
+    - Containment_ind: int/int showing the containment index (number of k-mers found in sample divided by total k-mers), e.g. 959/1053.
+    - Naive_ANI: containment ANI without coverage adjustment.
+    - kmers_reassigned: the number of k-mers reassigned away from the genome.
+    - Contig_name: name of the first contig in the genome
+
+Additional output files can be produced. The MetaPhlAn-style output is formatted similarly to the output of the `MetaPhlAn tool <https://toolshed.g2.bx.psu.edu/repos/iuc/metaphlan/metaphlan/4.1.1+galaxy3>`_.
+This output is *NOT* compatible with Krona directly. For that, please select the Krona-style output option.
+
+
+    ]]></help>
+    <expand macro="citation"/>
+</tool>