Skip to content

new module: Busco #1605

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions modules/busco/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Runs BUSCO on one FASTA file and collects its summary tables and, when
// produced, the intermediate search/sequence files.
//
// Keys read from the `meta` map by this process:
//   meta.id      - used as the task tag AND as the BUSCO `--out` name, so every
//                  output glob below is rooted at "<meta.id>/".
//   meta.mode    - passed verbatim to `--mode`.
//   meta.lineage - when non-empty, passed to `--lineage_dataset`; otherwise no
//                  lineage option is added to the command line.
process BUSCO {
    tag "$meta.id"
    label 'process_medium'

    conda (params.enable_conda ? "bioconda::busco=5.3.2" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/busco:5.3.2--pyhdfd78af_0':
        'quay.io/biocontainers/busco:5.3.2--pyhdfd78af_0' }"

    input:
    tuple val(meta), path(fasta)

    output:
    // Top-level summaries and per-run tables: always expected.
    tuple val(meta), path("$meta.id/short_summary.*.txt")                                           , emit: short_summary_txt
    tuple val(meta), path("$meta.id/short_summary.*.json")                                          , emit: short_summary_json
    tuple val(meta), path("$meta.id/run_*/full_table.tsv")                                          , emit: run_full_table
    tuple val(meta), path("$meta.id/run_*/short_summary.txt")                                       , emit: run_short_summary_txt
    tuple val(meta), path("$meta.id/run_*/short_summary.json")                                      , emit: run_short_summary_json
    tuple val(meta), path("$meta.id/run_*/missing_busco_list.tsv")                                  , emit: run_missing_busco_list
    // Everything below is optional: the task does not fail when these files are absent.
    tuple val(meta), path("$meta.id/run_*/hmmer_output/*.out")                                      , optional:true, emit: hmmer_output
    tuple val(meta), path("$meta.id/run_*/blast_output/coordinates.tsv")                            , optional:true, emit: blast_coordinates
    tuple val(meta), path("$meta.id/run_*/blast_output/tblastn.tsv")                                , optional:true, emit: tblastn
    tuple val(meta), path("$meta.id/run_*/blast_output/sequences/*.temp")                           , optional:true, emit: blast_sequences
    tuple val(meta), path("$meta.id/run_*/busco_sequences/single_copy_busco_sequences/*.{fna,faa}") , optional:true, emit: single_copy_busco_sequences
    tuple val(meta), path("$meta.id/run_*/busco_sequences/multi_copy_busco_sequences/*.{fna,faa}")  , optional:true, emit: multi_copy_busco_sequences
    tuple val(meta), path("$meta.id/run_*/busco_sequences/fragmented_busco_sequences/*.{fna,faa}")  , optional:true, emit: fragmented_busco_sequences
    tuple val(meta), path("$meta.id/prodigal_output/predicted_genes/predicted.fna")                 , optional:true, emit: predicted_fna
    tuple val(meta), path("$meta.id/prodigal_output/predicted_genes/predicted.faa")                 , optional:true, emit: predicted_faa
    tuple val(meta), path("$meta.id/translated_sequences/*.faa")                                    , optional:true, emit: translated_sequences
    path "versions.yml"                                                                             , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // Only add the lineage option when the caller supplied one in the meta map.
    def lineage = meta.lineage ? "--lineage_dataset ${meta.lineage}" : ""

    // NOTE: the output directory is named after meta.id (not task.ext.prefix)
    // because the output globs above are written against "$meta.id/".
    """
    # -c: write to stdout, -d: decompress, -f: force; with -c and -f gzip copies
    # input that is not gzip-compressed unchanged, so plain FASTA also works.
    gzip -cdf ${fasta} > __UNCOMPRESSED_FASTA_FILE__

    busco \\
        --in __UNCOMPRESSED_FASTA_FILE__ \\
        --mode ${meta.mode} \\
        --out $meta.id \\
        -c $task.cpus \\
        ${lineage} \\
        $args

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' )
    END_VERSIONS
    """
}
116 changes: 116 additions & 0 deletions modules/busco/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
name: busco
description: Evaluation of the quality of genomic “data products” such as genome assemblies or gene sets.
keywords:
  - quality control
  - completeness
  - genome
  - transcriptome
  - proteome
tools:
  - busco:
      description: BUSCO provides measures for quantitative assessment of genome assembly, gene set, and transcriptome completeness based on evolutionarily informed expectations of gene content from near-universal single-copy orthologs selected from OrthoDB.
      homepage: "https://busco.ezlab.org/"
      documentation: "https://busco.ezlab.org/busco_userguide.html"
      tool_dev_url: "https://gitlab.com/ezlab/busco"
      doi: "10.1002/cpz1.323"
      licence: ["MIT"]

# The process takes a single tuple (meta, fasta). The BUSCO mode and lineage are
# not separate inputs: they are read from the `mode` and `lineage` keys of meta.
input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', mode:"protein", lineage:"" ]
        Keys used by the module:
        - id: sample identifier; also used as the name of the BUSCO output directory.
        - mode: sets the assessment MODE as "genome", "protein", "transcriptome".
        - lineage: it can be a dataset name, i.e. bacteria_odb10, or a path
          i.e. ./bacteria_odb10 or /home/user/bacteria_odb10.
          In the former case, which is the recommended usage, BUSCO will
          automatically download and version the corresponding dataset.
          You can get the list of possible datasets by running
          "busco --list-datasets" see "www.busco.ezlab.org/list_of_lineages.html"
          and "www.busco.ezlab.org/busco_userguide.html#lineage-datasets".
          When empty, no `--lineage_dataset` option is passed to BUSCO.
  - fasta:
      type: file
      description: |
        Either a nucleotide fasta file or a protein fasta file, depending on the BUSCO mode.
        The file may be gzip-compressed; the module decompresses it before running BUSCO.
        A directory of fasta files (BUSCO batch mode) is not supported by this module.
      pattern: "*.{fasta,fa,fna,faa,fasta.gz,fa.gz,fna.gz,faa.gz}"

# Output patterns are relative to the BUSCO output directory, i.e. "<meta.id>/".
output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. [ id:'test', mode:"protein", lineage:"" ]
  - short_summary_txt:
      type: file
      description: Contains a plain text summary of the results in BUSCO notation.
      pattern: "./short_summary.*.txt"
  - short_summary_json:
      type: file
      description: Contains a summary of the results in JSON form.
      pattern: "./short_summary.*.json"
  - run_full_table:
      type: file
      description: Contains the complete results in a tabular format with scores and lengths of BUSCO matches, and coordinates (for genome mode) or gene/protein IDs (for transcriptome or protein mode).
      pattern: "./run_*/full_table.tsv"
  - run_short_summary_txt:
      type: file
      description: Contains a plain text summary of the results in BUSCO notation.
      pattern: "./run_*/short_summary.txt"
  - run_short_summary_json:
      type: file
      description: Contains a summary of the results in JSON form.
      pattern: "./run_*/short_summary.json"
  - run_missing_busco_list:
      type: file
      description: Contains a list of missing BUSCOs.
      pattern: "./run_*/missing_busco_list.tsv"
  - hmmer_output:
      type: file
      description: Tabular format HMMER output of searches with BUSCO HMMs.
      pattern: "./run_*/hmmer_output/*.out"
  - blast_coordinates:
      type: file
      description: Locations of BUSCO matches (eukaryotic genome).
      pattern: "./run_*/blast_output/coordinates.tsv"
  - tblastn:
      type: file
      description: Tabular tBLASTn results.
      pattern: "./run_*/blast_output/tblastn.tsv"
  - blast_sequences:
      type: file
      description: Sequences having blast results.
      pattern: "./run_*/blast_output/sequences/*.temp"
  - single_copy_busco_sequences:
      type: file
      description: FASTA format file for each BUSCO gene identified. ".faa" files contain protein sequences, ".fna" files (where created) contain coding sequences.
      pattern: "./run_*/busco_sequences/single_copy_busco_sequences/*.{fna,faa}"
  - multi_copy_busco_sequences:
      type: file
      description: FASTA format file for each BUSCO gene identified. ".faa" files contain protein sequences, ".fna" files (where created) contain coding sequences.
      pattern: "./run_*/busco_sequences/multi_copy_busco_sequences/*.{fna,faa}"
  - fragmented_busco_sequences:
      type: file
      description: FASTA format file for each BUSCO gene identified. ".faa" files contain protein sequences, ".fna" files (where created) contain coding sequences.
      pattern: "./run_*/busco_sequences/fragmented_busco_sequences/*.{fna,faa}"
  - predicted_fna:
      type: file
      description: A nucleotide file for each predicted gene
      pattern: "./prodigal_output/predicted_genes/predicted.fna"
  - predicted_faa:
      type: file
      description: A protein file for each predicted gene
      pattern: "./prodigal_output/predicted_genes/predicted.faa"
  - translated_sequences:
      type: file
      description: |
        Six frame translations of each transcript made by the transcriptome mode.
        It is a naive translation, ignoring start and stop codons only in order to apply hmmsearch and do not represent proteins.
      pattern: "./translated_sequences/*.faa"
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"

authors:
  - "@nvlachos"
  - "@jvhagey"
  - "@priyanka-surana"
4 changes: 4 additions & 0 deletions tests/config/pytest_modules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,10 @@ bracken/bracken:
- modules/bracken/bracken/**
- tests/modules/bracken/bracken/**

busco:
  - modules/busco/**
  - tests/modules/busco/**

bwa/aln:
- modules/bwa/aln/**
- tests/modules/bwa/aln/**
Expand Down
33 changes: 33 additions & 0 deletions tests/modules/busco/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env nextflow

nextflow.enable.dsl = 2

include { BUSCO } from '../../../modules/busco/main.nf'

/*workflow test_busco_genome_augustus {
input = [ [ id:'test', mode:"genome", lineage:"bacteroidales_odb10"],
file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) ]

BUSCO(input)
}*/

// Runs BUSCO in transcriptome mode on gzipped contigs, with an empty lineage
// string in the meta map.
workflow test_busco_transcriptome {
    def sample_meta = [ id:'test', mode:"transcriptome", lineage:"" ]
    def contigs     = file(params.test_data['bacteroides_fragilis']['illumina']['test1_contigs_fa_gz'], checkIfExists: true)

    BUSCO( [ sample_meta, contigs ] )
}

// Runs BUSCO in protein mode on an uncompressed proteome FASTA with an
// explicit lineage dataset.
// NOTE(review): the lineage is saccharomycetes_odb10 while the proteome comes
// from the candidatus_portiera_aleyrodidarum test data — confirm this pairing
// is intentional.
workflow test_busco_protein {
    def sample_meta = [ id:'test', mode:"protein", lineage:"saccharomycetes_odb10" ]
    def proteome    = file(params.test_data['candidatus_portiera_aleyrodidarum']['genome']['proteome_fasta'], checkIfExists: true)

    BUSCO( [ sample_meta, proteome ] )
}

// Runs BUSCO in genome mode on a gzipped genome FASTA with the
// bacteroidales_odb10 lineage dataset.
workflow test_busco_genome {
    def sample_meta = [ id:'test', mode:"genome", lineage:"bacteroidales_odb10" ]
    def genome      = file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true)

    BUSCO( [ sample_meta, genome ] )
}
4 changes: 4 additions & 0 deletions tests/modules/busco/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
process {

    // Publish results under <params.outdir>/<tool>, where <tool> is the
    // lower-cased text before the first '_' in the last ':'-separated
    // component of the process name.
    publishDir = {
        def simple_name = task.process.tokenize(':')[-1]
        def tool_dir    = simple_name.tokenize('_')[0].toLowerCase()
        "${params.outdir}/${tool_dir}"
    }
}
Loading