nf-core · jvhagey · May 2, 2022 · May 2, 2022 · May 2, 2022 · May 2, 2022
diff --git a/modules/busco/main.nf b/modules/busco/main.nf
@@ -0,0 +1,57 @@
+process BUSCO {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda (params.enable_conda ? "bioconda::busco=5.3.2" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/busco:5.3.2--pyhdfd78af_0':
+        'quay.io/biocontainers/busco:5.3.2--pyhdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+
+    output:
+    tuple val(meta), path("$meta.id/short_summary.*.txt")                                                           , emit: short_summary_txt
+    tuple val(meta), path("$meta.id/short_summary.*.json")                                                          , emit: short_summary_json
+    tuple val(meta), path("$meta.id/run_*/full_table.tsv")                                                          , emit: run_full_table
+    tuple val(meta), path("$meta.id/run_*/short_summary.txt")                                                       , emit: run_short_summary_txt
+    tuple val(meta), path("$meta.id/run_*/short_summary.json")                                                      , emit: run_short_summary_json
+    tuple val(meta), path("$meta.id/run_*/missing_busco_list.tsv")                                                  , emit: run_missing_busco_list
+    tuple val(meta), path("$meta.id/run_*/hmmer_output/*.out")                                       , optional:true, emit: hmmer_output
+    tuple val(meta), path("$meta.id/run_*/blast_output/coordinates.tsv")                             , optional:true, emit: blast_coordinates
+    tuple val(meta), path("$meta.id/run_*/blast_output/tblastn.tsv")                                 , optional:true, emit: tblastn
+    tuple val(meta), path("$meta.id/run_*/blast_output/sequences/*.temp")                            , optional:true, emit: blast_sequences
+    tuple val(meta), path("$meta.id/run_*/busco_sequences/single_copy_busco_sequences/*.{fna,faa}")  , optional:true, emit: single_copy_busco_sequences
+    tuple val(meta), path("$meta.id/run_*/busco_sequences/multi_copy_busco_sequences/*.{fna,faa}")   , optional:true, emit: multi_copy_busco_sequences
+    tuple val(meta), path("$meta.id/run_*/busco_sequences/fragmented_busco_sequences/*.{fna,faa}")   , optional:true, emit: fragmented_busco_sequences
+    tuple val(meta), path("$meta.id/prodigal_output/predicted_genes/predicted.fna")                  , optional:true, emit: predicted_fna
+    tuple val(meta), path("$meta.id/prodigal_output/predicted_genes/predicted.faa")                  , optional:true, emit: predicted_faa
+    tuple val(meta), path("$meta.id/translated_sequences/*.faa")                                     , optional:true, emit: translated_sequences
+    path "versions.yml"                                                                                             , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    // NOTE: task.ext.prefix is not consulted here; --out and every output glob above are keyed on meta.id.
+    // An empty or unset meta.lineage leaves --lineage_dataset off the command line entirely.
+    def lineage = meta.lineage ? "--lineage_dataset ${meta.lineage}" : ""
+    // gzip -cdf writes the input to one fixed file name for --in (per the gzip manual, -f with -c copies non-gzip data through unchanged).
+    """
+    gzip -cdf ${fasta} > __UNCOMPRESSED_FASTA_FILE__
+
+    busco \\
+        --in __UNCOMPRESSED_FASTA_FILE__ \\
+        --mode ${meta.mode} \\
+        --out $meta.id \\
+        -c $task.cpus \\
+        ${lineage} \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/busco/meta.yml b/modules/busco/meta.yml
@@ -0,0 +1,116 @@
+name: busco
+description: Evaluation of the quality of genomic “data products” such as genome assemblies or gene sets.
+keywords:
+  - quality control
+tools:
+  - busco:
+      description: BUSCO provides measures for quantitative assessment of genome assembly, gene set, and transcriptome completeness based on evolutionarily informed expectations of gene content from near-universal single-copy orthologs selected from OrthoDB.
+      homepage: "https://busco.ezlab.org/"
+      documentation: "https://busco.ezlab.org/busco_userguide.html"
+      tool_dev_url: "https://gitlab.com/ezlab/busco/-/releases#5.3.2"
+      doi: "10.1002/cpz1.323"
+      licence: ["MIT"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', mode:"protein", lineage:"" ]
+  - fasta:
+      type: file
+      description: |
+        Either a nucleotide fasta file or a protein fasta file, depending on the BUSCO mode.
+        BUSCO itself (v5.1.0 and later) also accepts a directory of fasta files for batch mode, but this module streams the input through `gzip -cdf` into a single file, so pass one (optionally gzip-compressed) fasta file.
+      pattern: "*.{fasta,fa,fna,faa}{,.gz}"
+  - mode:
+      type: value
+      description: Read from the meta map (meta.mode) and passed to "--mode". Sets the assessment MODE as "genome", "protein", "transcriptome".
+  - lineage:
+      type: value
+      description: |
+        Read from the meta map (meta.lineage); leave it empty to run BUSCO without "--lineage_dataset". It can be a dataset name, i.e. bacteria_odb10, or a path i.e. ./bacteria_odb10 or /home/user/bacteria_odb10.
+        In the former case, which is the recommended usage, BUSCO will automatically download and version the corresponding dataset.
+        You can get the list of possible datasets by running "busco --list-datasets" see "www.busco.ezlab.org/list_of_lineages.html" and "www.busco.ezlab.org/busco_userguide.html#lineage-datasets".
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', mode:"protein", lineage:"" ]
+  - short_summary_txt:
+      type: txt
+      description: Contains a plain text summary of the results in BUSCO notation.
+      pattern: "./short_summary.*.txt"
+  - short_summary_json:
+      type: json
+      description: Contains a summary of the results in JSON form.
+      pattern: "./short_summary.*.json"
+  - run_full_table:
+      type: tsv
+      description: Contains the complete results in a tabular format with scores and lengths of BUSCO matches, and coordinates (for genome mode) or gene/protein IDs (for transcriptome or protein mode).
+      pattern: "./run_*/full_table.tsv"
+  - run_short_summary_txt:
+      type: txt
+      description: Contains a plain text summary of the results in BUSCO notation.
+      pattern: "./run_*/short_summary.txt"
+  - run_short_summary_json:
+      type: json
+      description: Contains a summary of the results in JSON form.
+      pattern: "./run_*/short_summary.json"
+  - run_missing_busco_list:
+      type: tsv
+      description: Contains a list of missing BUSCOs.
+      pattern: "./run_*/missing_busco_list.tsv"
+  - hmmer_output:
+      type: out
+      description: Tabular format HMMER output of searches with BUSCO HMMs.
+      pattern: "./run_*/hmmer_output/*.out"
+  - blast_coordinates:
+      type: tsv
+      description: Locations of BUSCO matches (eukaryotic genome).
+      pattern: "./run_*/blast_output/coordinates.tsv"
+  - tblastn:
+      type: tsv
+      description: Tabular tBLASTn results.
+      pattern: "./run_*/blast_output/tblastn.tsv"
+  - blast_sequences:
+      type: fasta
+      description: Sequences having blast results.
+      pattern: "./run_*/blast_output/sequences/*.temp"
+  - single_copy_busco_sequences:
+      type: fasta
+      description: FASTA format file for each BUSCO gene identified. ".faa" files contain protein sequences, ".fna" files (where created) contain coding sequences.
+      pattern: "./run_*/busco_sequences/single_copy_busco_sequences/*.{fna,faa}"
+  - multi_copy_busco_sequences:
+      type: fasta
+      description: FASTA format file for each BUSCO gene identified. ".faa" files contain protein sequences, ".fna" files (where created) contain coding sequences.
+      pattern: "./run_*/busco_sequences/multi_copy_busco_sequences/*.{fna,faa}"
+  - fragmented_busco_sequences:
+      type: fasta
+      description: FASTA format file for each BUSCO gene identified. ".faa" files contain protein sequences, ".fna" files (where created) contain coding sequences.
+      pattern: "./run_*/busco_sequences/fragmented_busco_sequences/*.{fna,faa}"
+  - predicted_fna:
+      type: fna
+      description: A nucleotide file for each predicted gene
+      pattern: "./prodigal_output/predicted_genes/predicted.fna"
+  - predicted_faa:
+      type: faa
+      description: A protein file for each predicted gene
+      pattern: "./prodigal_output/predicted_genes/predicted.faa"
+  - translated_sequences:
+      type: faa
+      description: |
+        Six frame translations of each transcript made by the transcriptome mode.
+        These are naive translations that ignore start and stop codons; they exist only so that hmmsearch can be applied and do not represent real proteins.
+      pattern: "./translated_sequences/*.faa"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@nvlachos"
+  - "@jvhagey"
+  - "@priyanka-surana"
diff --git a/tests/config/pytest_modules.yml b/tests/config/pytest_modules.yml
@@ -337,6 +337,10 @@ bracken/bracken:
   - modules/bracken/bracken/**
   - tests/modules/bracken/bracken/**
 
+busco:
+  - modules/busco/**
+  - tests/modules/busco/**
+
 bwa/aln:
   - modules/bwa/aln/**
   - tests/modules/bwa/aln/**

diff --git a/tests/modules/busco/main.nf b/tests/modules/busco/main.nf
@@ -0,0 +1,33 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl = 2
+
+include { BUSCO } from '../../../modules/busco/main.nf'
+
+/*workflow test_busco_genome_augustus {
+    input = [ [ id:'test', mode:"genome", lineage:"bacteroidales_odb10"],
+        file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) ]
+
+    BUSCO(input)
+}*/
+
+workflow test_busco_transcriptome {
+    // Transcriptome mode on gzipped contigs, with an empty lineage in the meta map.
+    meta  = [ id:'test', mode:"transcriptome", lineage:"" ]
+    fasta = file(params.test_data['bacteroides_fragilis']['illumina']['test1_contigs_fa_gz'], checkIfExists: true)
+    BUSCO([ meta, fasta ])
+}
+
+workflow test_busco_protein {
+    // Protein mode on an uncompressed proteome, with an explicit lineage dataset.
+    meta  = [ id:'test', mode:"protein", lineage:"saccharomycetes_odb10" ]
+    fasta = file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['proteome_fasta'], checkIfExists: true)
+    BUSCO([ meta, fasta ])
+}
+
+workflow test_busco_genome {
+    // Genome mode on a gzipped assembly, with an explicit lineage dataset.
+    meta  = [ id:'test', mode:"genome", lineage:"bacteroidales_odb10" ]
+    fasta = file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true)
+    BUSCO([ meta, fasta ])
+}
diff --git a/tests/modules/busco/nextflow.config b/tests/modules/busco/nextflow.config
@@ -0,0 +1,4 @@
+process {
+    // Publish under <outdir>/<name>, where <name> is the last ':'-separated segment of task.process, cut at its first '_' and lower-cased.
+    publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }
+}