Skip to content

Commit

Permalink
Merge pull request #1239 from priyanka-surana/busco
Browse files Browse the repository at this point in the history
Busco
sateeshperi authored May 6, 2022
2 parents 56e9e6b + 7a85760 commit e13d634
Showing 6 changed files with 748 additions and 0 deletions.
84 changes: 84 additions & 0 deletions modules/busco/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
process BUSCO {
tag "$meta.id"
label 'process_medium'

conda (params.enable_conda ? "bioconda::busco=5.3.2" : null)
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/busco:5.3.2--pyhdfd78af_0':
'quay.io/biocontainers/busco:5.3.2--pyhdfd78af_0' }"

input:
tuple val(meta), path('tmp_input/*')
each lineage // Required: lineage to check against, "auto" enables --auto-lineage instead
path busco_lineages_path // Recommended: path to busco lineages - downloads if not set
path config_file // Optional: busco configuration file

output:
tuple val(meta), path("*-busco.batch_summary.txt"), emit: batch_summary
tuple val(meta), path("short_summary.*.txt") , emit: short_summaries_txt, optional: true
tuple val(meta), path("short_summary.*.json") , emit: short_summaries_json, optional: true
tuple val(meta), path("*-busco") , emit: busco_dir
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}-${lineage}"
def busco_config = config_file ? "--config $config_file" : ''
def busco_lineage = lineage.equals('auto') ? '--auto-lineage' : "--lineage_dataset ${lineage}"
def busco_lineage_dir = busco_lineages_path ? "--offline --download_path ${busco_lineages_path}" : ''
"""
# Nextflow changes the container --entrypoint to /bin/bash (container default entrypoint: /usr/local/env-execute)
# Check for container variable initialisation script and source it.
if [ -f "/usr/local/env-activate.sh" ]; then
set +u # Otherwise, errors out because of various unbound variables
. "/usr/local/env-activate.sh"
set -u
fi
# If the augustus config directory is not writable, then copy to writeable area
if [ ! -w "\${AUGUSTUS_CONFIG_PATH}" ]; then
# Create writable tmp directory for augustus
AUG_CONF_DIR=\$( mktemp -d -p \$PWD )
cp -r \$AUGUSTUS_CONFIG_PATH/* \$AUG_CONF_DIR
export AUGUSTUS_CONFIG_PATH=\$AUG_CONF_DIR
echo "New AUGUSTUS_CONFIG_PATH=\${AUGUSTUS_CONFIG_PATH}"
fi
# Ensure the input is uncompressed
INPUT_SEQS=input_seqs
mkdir "\$INPUT_SEQS"
cd "\$INPUT_SEQS"
for FASTA in ../tmp_input/*; do
if [ "\${FASTA##*.}" == 'gz' ]; then
gzip -cdf "\$FASTA" > \$( basename "\$FASTA" .gz )
else
ln -s "\$FASTA" .
fi
done
cd ..
busco \\
--cpu $task.cpus \\
--in "\$INPUT_SEQS" \\
--out ${prefix}-busco \\
$busco_lineage \\
$busco_lineage_dir \\
$busco_config \\
$args
# clean up
rm -rf "\$INPUT_SEQS"
# Move files to avoid staging/publishing issues
mv ${prefix}-busco/batch_summary.txt ${prefix}-busco.batch_summary.txt
mv ${prefix}-busco/*/short_summary.*.{json,txt} . || echo "Short summaries were not available: No genes were found."
cat <<-END_VERSIONS > versions.yml
"${task.process}":
busco: \$( busco --version 2>&1 | sed 's/^BUSCO //' )
END_VERSIONS
"""
}
69 changes: 69 additions & 0 deletions modules/busco/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
name: busco
description: Benchmarking Universal Single Copy Orthologs
keywords:
- quality control
- genome
- transcriptome
- proteome
tools:
- busco:
description: BUSCO provides measures for quantitative assessment of genome assembly, gene set, and transcriptome completeness based on evolutionarily informed expectations of gene content from near-universal single-copy orthologs selected from OrthoDB.
homepage: https://busco.ezlab.org/
documentation: https://busco.ezlab.org/busco_userguide.html
tool_dev_url: https://gitlab.com/ezlab/busco
doi: "10.1007/978-1-4939-9173-0_14"
licence: ["MIT"]

input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- fasta:
type: file
description: Nucleic or amino acid sequence file in FASTA format.
pattern: "*.{fasta,fna,fa,fasta.gz,fna.gz,fa.gz}"
- lineage:
type: value
description: The BUSCO lineage to use, or "auto" to automatically select lineage
- busco_lineages_path:
type: directory
description: Path to local BUSCO lineages directory.
- config_file:
type: file
description: Path to BUSCO config file.

output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- batch_summary:
type: file
description: Summary of all sequence files analyzed
pattern: "*-busco.batch_summary.txt"
- short_summaries_txt:
type: file
description: Short Busco summary in plain text format
pattern: "short_summary.*.txt"
- short_summaries_json:
type: file
description: Short Busco summary in JSON format
pattern: "short_summary.*.json"
- busco_dir:
type: directory
description: BUSCO lineage specific output
pattern: "*-busco"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"

authors:
- "@priyanka-surana"
- "@charles-plessy"
- "@mahesh-panchal"
- "@muffato"
- "@jvhagey"
4 changes: 4 additions & 0 deletions tests/config/pytest_modules.yml
Original file line number Diff line number Diff line change
@@ -337,6 +337,10 @@ bracken/bracken:
- modules/bracken/bracken/**
- tests/modules/bracken/bracken/**

busco:
- modules/busco/**
- tests/modules/busco/**

bwa/aln:
- modules/bwa/aln/**
- tests/modules/bwa/aln/**
404 changes: 404 additions & 0 deletions tests/modules/busco/main.nf

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions tests/modules/busco/nextflow.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
process {

publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }

withName: 'test_busco_genome_single_fasta:BUSCO' {
ext.args = '--mode genome'
}

withName: 'test_busco_genome_multi_fasta:BUSCO' {
ext.args = '--mode genome'
}

withName: 'test_busco_eukaryote_metaeuk:BUSCO' {
ext.args = '--mode genome'
}

withName: 'test_busco_eukaryote_augustus:BUSCO' {
ext.args = '--mode genome --augustus'
}

withName: 'test_busco_protein:BUSCO' {
ext.args = '--mode proteins'
}

withName: 'test_busco_transcriptome:BUSCO'{
ext.args = '--mode transcriptome'
}
}
159 changes: 159 additions & 0 deletions tests/modules/busco/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
- name: busco test_busco_genome_single_fasta
command: nextflow run tests/modules/busco -entry test_busco_genome_single_fasta -c tests/config/nextflow.config
tags:
- busco
files:
- path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.json
contains:
- "one_line_summary"
- "input_file"
- "mode"
- "dataset"
- path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.txt
contains:
- "BUSCO version"
- "The lineage dataset is"
- "BUSCO was run in mode"
- "Complete BUSCOs"
- "Missing BUSCOs"
- "Dependencies and versions"
- path: output/busco/short_summary.specific.bacteroidetes_odb10.genome.fna.json
contains:
- "one_line_summary"
- "input_file"
- "mode"
- "dataset"
- path: output/busco/short_summary.specific.bacteroidetes_odb10.genome.fna.txt
contains:
- "BUSCO version"
- "The lineage dataset is"
- "BUSCO was run in mode"
- "Complete BUSCOs"
- "Missing BUSCOs"
- "Dependencies and versions"
- path: output/busco/test-bacteria_odb10-busco.batch_summary.txt
md5sum: e50690742e9ae6abdd2bf99334ff9e12
- path: output/busco/test-bacteroidetes_odb10-busco.batch_summary.txt
md5sum: 4c1b2c4317c88398eddc30877ed740d9
- path: output/busco/versions.yml
md5sum: 8aa830f71587d859df35c6cfab59f35d

- name: busco test_busco_genome_multi_fasta
command: nextflow run tests/modules/busco -entry test_busco_genome_multi_fasta -c tests/config/nextflow.config
tags:
- busco
files:
- path: output/busco/short_summary.specific.bacteria_odb10.genome.fasta.json
contains:
- "one_line_summary"
- "input_file"
- "mode"
- "dataset"
- path: output/busco/short_summary.specific.bacteria_odb10.genome.fasta.txt
contains:
- "BUSCO version"
- "The lineage dataset is"
- "BUSCO was run in mode"
- "Complete BUSCOs"
- "Missing BUSCOs"
- "Dependencies and versions"
- path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.json
contains:
- "one_line_summary"
- "input_file"
- "mode"
- "dataset"
- path: output/busco/short_summary.specific.bacteria_odb10.genome.fna.txt
contains:
- "BUSCO version"
- "The lineage dataset is"
- "BUSCO was run in mode"
- "Complete BUSCOs"
- "Missing BUSCOs"
- "Dependencies and versions"
- path: output/busco/test-bacteria_odb10-busco.batch_summary.txt
md5sum: 5360dfe83bec1f5741ee115e53e6b517
- path: output/busco/versions.yml
md5sum: 9a959eb0a1f765777dff1ea2f5c139c0

- name: busco test_busco_eukaryote_metaeuk
command: nextflow run tests/modules/busco -entry test_busco_eukaryote_metaeuk -c tests/config/nextflow.config
tags:
- busco
files:
- path: output/busco/short_summary.specific.eukaryota_odb10.genome.fasta.json
contains:
- "one_line_summary"
- "input_file"
- "mode"
- "dataset"
- path: output/busco/short_summary.specific.eukaryota_odb10.genome.fasta.txt
contains:
- "BUSCO version"
- "The lineage dataset is"
- "BUSCO was run in mode"
- "Complete BUSCOs"
- "Missing BUSCOs"
- "Dependencies and versions"
- path: output/busco/test-eukaryota_odb10-busco.batch_summary.txt
md5sum: a70806f99ba5706d7353d3353b3f1d2b
- path: output/busco/versions.yml
md5sum: 34a808c257e6db1b0456f3b4372bc477

- name: busco test_busco_eukaryote_augustus
command: nextflow run tests/modules/busco -entry test_busco_eukaryote_augustus -c tests/config/nextflow.config
tags:
- busco
files:
- path: output/busco/test-eukaryota_odb10-busco.batch_summary.txt
md5sum: 660393dd43cd6a093b952d4b8ad41e40
- path: output/busco/versions.yml
md5sum: 2caac915461410b16a1524ac064cd0df

- name: busco test_busco_protein
command: nextflow run tests/modules/busco -entry test_busco_protein -c tests/config/nextflow.config
tags:
- busco
files:
- path: output/busco/short_summary.specific.bacteria_odb10.proteome.fasta.json
contains:
- "one_line_summary"
- "input_file"
- "mode"
- "dataset"
- path: output/busco/short_summary.specific.bacteria_odb10.proteome.fasta.txt
contains:
- "BUSCO version"
- "The lineage dataset is"
- "BUSCO was run in mode"
- "Complete BUSCOs"
- "Missing BUSCOs"
- "Dependencies and versions"
- path: output/busco/test-bacteria_odb10-busco.batch_summary.txt
md5sum: fd3b4e30ce74d1fcb95d6286d6e2049f
- path: output/busco/versions.yml
md5sum: d7392261a57960a7e6aea609dce824f5

- name: busco test_busco_transcriptome
command: nextflow run tests/modules/busco -entry test_busco_transcriptome -c tests/config/nextflow.config
tags:
- busco
files:
- path: output/busco/short_summary.specific.bacteria_odb10.test1.contigs.fa.json
contains:
- "one_line_summary"
- "input_file"
- "mode"
- "dataset"
- path: output/busco/short_summary.specific.bacteria_odb10.test1.contigs.fa.txt
contains:
- "BUSCO version"
- "The lineage dataset is"
- "BUSCO was run in mode"
- "Complete BUSCOs"
- "Missing BUSCOs"
- "Dependencies and versions"
- path: output/busco/test-bacteria_odb10-busco.batch_summary.txt
md5sum: 9a176cafe66ac0adca89dc34ad2be13f
- path: output/busco/versions.yml
md5sum: 30eacbc7df70f6b1e72e0a7b6d02a7e1

0 comments on commit e13d634

Please sign in to comment.