Skip to content

Commit

Permalink
Merge pull request #179 from skrakau/update_busco_5.1.0
Browse files Browse the repository at this point in the history
Update to busco 5.1.0 and enable automated lineage selection
  • Loading branch information
skrakau authored Apr 26, 2021
2 parents a774b80 + 5e9c00b commit af7c17d
Show file tree
Hide file tree
Showing 16 changed files with 501 additions and 138 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Added`

- [#179](https://github.com/nf-core/mag/pull/179) - Add BUSCO automated lineage selection functionality (new default). The pameter `--busco_auto_lineage_prok` can be used to only consider prokaryotes and the parameter `--busco_download_path` to run BUSCO in `offline` mode.

### `Changed`

- [#162](https://github.com/nf-core/mag/pull/162) - Switch to DSL2
Expand All @@ -17,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#162](https://github.com/nf-core/mag/pull/162) - Update `FastQC` from version `0.11.8` to `0.11.9`
- [#172](https://github.com/nf-core/mag/pull/172) - Compressed discarded MetaBAT2 output files
- [#176](https://github.com/nf-core/mag/pull/176) - Update CAT DB link
- [#179](https://github.com/nf-core/mag/pull/179) - Update `BUSCO` from version `4.1.4` to `5.1.0`
- [#179](https://github.com/nf-core/mag/pull/179) - By default BUSCO now performs automated lineage selection instead of using the bacteria_odb10 lineage as reference. Specific lineage datasets can still be provided via `--busco_reference`.

### `Fixed`

Expand Down
3 changes: 2 additions & 1 deletion assets/multiqc_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ top_modules:
info: 'After trimming and, if requested, contamination removal.'
path_filters:
- '*trimmed*'
- 'busco'
- 'busco':
info: 'assesses genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs. In case BUSCO''s automated lineage selection was used, only generic results for the selected domain are shown and only for genome bins and kept, unbinned contigs for which the BUSCO analysis was successfull, i.e. not for contigs for which no BUSCO genes could be found. Bins for which a specific virus lineage was selected are also not shown.'
- 'quast'


Expand Down
158 changes: 136 additions & 22 deletions bin/summary_busco.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,140 @@
#!/usr/bin/env python

# USAGE: ./summary.busco.py *.txt
# USAGE: ./summary.busco.py -sd <summaries_domain> -ss <summaries_specific> -f <failed_bins>

import re
from sys import argv

# "# Summarized benchmarking in BUSCO notation for file MEGAHIT-testset1.contigs.fa"
# " C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148"

regexes = [r"# Summarized benchmarking in BUSCO notation for file (\S+)", r" C:(\S+)%\[S:",
r"%\[S:(\S+)%,D:", r"%,D:(\S+)%\],F:", r"%\],F:(\S+)%,M:", r"%,M:(\S+)%,n:", r"%,n:(\S+)"]
columns = ["GenomeBin", "%Complete", "%Complete and single-copy",
"%Complete and duplicated", "%Fragmented", "%Missing", "Total number"]

# Search each file using its regex
print("\t".join(columns))
for FILE in argv[1:]:
with open(FILE) as x:
results = []
TEXT = x.read()
for REGEX in regexes:
match = re.search(REGEX, TEXT)
if match:
results.append(match.group(1))
print("\t".join(results))
import sys
import argparse
import os.path
import pandas as pd

def parse_args(args=None):
parser = argparse.ArgumentParser()
parser.add_argument('-a', '--auto', default=False, action='store_true', help="BUSCO run in auto lineage selection mode.")
parser.add_argument('-sd', "--summaries_domain", nargs="+", metavar='FILE', help="List of BUSCO summary files for domains.")
parser.add_argument('-ss', "--summaries_specific", nargs="+", metavar='FILE', help="List of BUSCO summary files for specific lineages.")
parser.add_argument('-f', "--failed_bins", nargs="+", metavar='FILE', help="List of files containing bin name for which BUSCO analysis failed.")
parser.add_argument('-o', "--out", required=True, metavar='FILE', type=argparse.FileType('w'), help="Output file containing final BUSCO summary.")
return parser.parse_args(args)


def main(args=None):
args = parse_args(args)

if not args.summaries_domain and not args.summaries_specific and not args.failed_bins:
sys.exit("Either --summaries_domain, --summaries_specific or --failed_bins must be specified!")

# "# Summarized benchmarking in BUSCO notation for file /path/to/MEGAHIT-testset1.contigs.fa"
# " C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148"

regexes = [r"# Summarized benchmarking in BUSCO notation for file (\S+)", r"# The lineage dataset is: (\S+) \(", r" C:(\S+)%\[S:",
r"%\[S:(\S+)%,D:", r"%,D:(\S+)%\],F:", r"%\],F:(\S+)%,M:", r"%,M:(\S+)%,n:", r"%,n:(\S+)"]
columns_domain = ["GenomeBin", \
"Domain", \
"%Complete (domain)", \
"%Complete and single-copy (domain)", \
"%Complete and duplicated (domain)", \
"%Fragmented (domain)", \
"%Missing (domain)", \
"Total number (domain)"]
columns_specific = ["GenomeBin", \
"Specific lineage dataset", \
"%Complete (specific)", \
"%Complete and single-copy (specific)", \
"%Complete and duplicated (specific)", \
"%Fragmented (specific)", \
"%Missing (specific)", \
"Total number (specific)"]

if args.auto:
columns = ["GenomeBin", \
"Domain", \
"%Complete (domain)", \
"%Complete and single-copy (domain)", \
"%Complete and duplicated (domain)", \
"%Fragmented (domain)", \
"%Missing (domain)", \
"Total number (domain)", \
"Specific lineage dataset", \
"%Complete (specific)", \
"%Complete and single-copy (specific)", \
"%Complete and duplicated (specific)", \
"%Fragmented (specific)", \
"%Missing (specific)", \
"Total number (specific)"]
else:
columns = ["GenomeBin", \
"Specific lineage dataset", \
"%Complete (specific)", \
"%Complete and single-copy (specific)", \
"%Complete and duplicated (specific)", \
"%Fragmented (specific)", \
"%Missing (specific)", \
"Total number (specific)"]

# Search each summary file using its regex
results_domain = []
if args.summaries_domain:
for file in args.summaries_domain:
with open(file) as infile:
results = []
text = infile.read()
for index, regex in enumerate(regexes):
match = re.search(regex, text)
if match:
if index == 0:
results.append(os.path.basename(match.group(1)))
else:
results.append(match.group(1))
results_domain.append(results)
df_domain = pd.DataFrame(results_domain, columns=columns_domain)

results_specific = []
if args.summaries_specific:
for file in args.summaries_specific:
with open(file) as infile:
results = []
text = infile.read()
for index, regex in enumerate(regexes):
match = re.search(regex, text)
if match:
if index == 0:
results.append(os.path.basename(match.group(1)))
else:
results.append(match.group(1))
results_specific.append(results)
df_specific = pd.DataFrame(results_specific, columns=columns_specific)

# Add entries for bins with failed analysis (for domain and specific lineage where applicable)
failed = []
if args.failed_bins:
for file in args.failed_bins:
with open(file) as infile:
line = infile.readline()
# in case of failed placements domain summary was used and specific part will be filled with NAs when merging
if re.split(r'[\t\n]', line)[1] != "Placements failed":
failed_bin = re.split(r'[\t\n]', line)[0]
if args.auto:
results = [failed_bin, pd.NA, "0.0", "0.0", "0.0", "0.0", "100.0", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA]
else:
results = [failed_bin, pd.NA, "0.0", "0.0", "0.0", "0.0", "100.0", pd.NA]
failed.append(results)
df_failed = pd.DataFrame(failed, columns=columns)

# merge results
if args.auto:
df_final = df_domain\
.merge(df_specific, on="GenomeBin", how='outer')\
.append(df_failed)
# check if 'Domain' is 'NA', but 'Specific lineage dataset' given -> 'Viruses'
df_final.loc[pd.isna(df_final['Domain']) & pd.notna(df_final['Specific lineage dataset']), 'Domain'] = "Viruses"

else:
df_final = df_specific\
.append(df_failed)

df_final.to_csv(args.out, sep="\t", index=False)


if __name__ == "__main__":
sys.exit(main())
4 changes: 4 additions & 0 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ process {
memory = { check_max (20.GB * task.attempt, 'memory' ) }
time = { check_max (8.h * task.attempt, 'time' ) }
}
withName: BUSCO {
cpus = { check_max (8 * task.attempt, 'cpus' ) }
memory = { check_max (20.GB * task.attempt, 'memory' ) }
}
withName: GET_SOFTWARE_VERSIONS {
cache = false
}
Expand Down
3 changes: 3 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ params {
'busco' {
publish_dir = "GenomeBinning/QC/BUSCO"
}
'busco_save_download' {
publish_dir = "GenomeBinning/QC/BUSCO"
}
'busco_plot' {
publish_dir = "GenomeBinning/QC/BUSCO"
}
Expand Down
22 changes: 14 additions & 8 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,26 +249,32 @@ Files in these two folders contain all contigs of an assembly.

### QC for metagenome assembled genomes with BUSCO

[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by MetaBAT2.
[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by MetaBAT2. By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_reference`, only results for this specific lineage will be generated.

**Output files:**

* `GenomeBinning/QC/BUSCO/`
* `[assembler]-[bin]_busco.log`: BUSCO log file
* `[assembler]-[bin]_buscos.fna.gz`: Nucleotide sequence of all identified BUSCOs
* `[assembler]-[bin]_buscos.faa.gz`: Aminoacid sequence of all identified BUSCOs
* `[assembler]-[bin]_busco.log`: Log file containing the standard output of BUSCO.
* `[assembler]-[bin]_busco.err`: File containing potential error messages returned from BUSCO.
* `short_summary.domain.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results for the selected domain when run in automated lineage selection mode. Not available for bins for which a viral lineage was selected.
* `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_reference`.
* `[assembler]-[bin]_buscos.[lineage].fna.gz`: Nucleotide sequence of all identified BUSCOs for used lineages (domain or specific).
* `[assembler]-[bin]_buscos.[lineage].faa.gz`: Aminoacid sequence of all identified BUSCOs for used lineages (domain or specific).

If the parameter `--save_busco_reference` is set the used BUSCO reference is stored.
If the parameter `--save_busco_reference` is set, additionally the used BUSCO lineage datasets are stored in the output directy.

**Output files:**

* `GenomeBinning/QC/BUSCO/reference/`
* `*.tar.gz`: BUSCO reference file
* `GenomeBinning/QC/BUSCO/`
* `busco_downloads/`: All files and lineage datasets downloaded by BUSCO when run in automated lineage selection mode. (Can currently not be used to reproduce analysis, see the [nf-core/mag website documentation](https://nf-co.re/mag/usage#reproducibility) how to achieve reproducible BUSCO results).
* `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_reference`.

Besides the reference files or output files created by BUSCO, the following summary files will be generated:

**Output files:**

* `GenomeBinning/QC/`
* `busco_summary.txt`: A summary table of the BUSCO results, with % of marker genes found
* `busco_summary.tsv`: A summary table of the BUSCO results, with % of marker genes found. If run in automated lineage selection mode, both the results for the selected domain and for the selected more specific lineage will be given, if available.
* `quast_and_busco_summary.tsv`; Summary of BUSCO and QUAST results

## MultiQC
Expand Down
2 changes: 1 addition & 1 deletion docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ You can fix this by using the prameter `--megahit_fix_cpu_1`. In both cases, do

MetaBAT2 is run by default with a fixed seed within this pipeline, thus producing reproducible results.

To allow also reproducible bin QC with BUSCO, the parameter `--save_busco_reference` can be used to store the reference database. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain accessible.
To allow also reproducible bin QC with BUSCO, run BUSCO providing already downloaded lineage datasets with `--busco_download_path` (BUSCO will be run using automated lineage selection in offline mode) or provide a specific lineage dataset via `--busco_reference` and use the parameter `--save_busco_reference`. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain (easily) accessible.

## Core Nextflow arguments

Expand Down
34 changes: 32 additions & 2 deletions lib/Completion.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
*/

class Completion {
static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[]) {
static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], busco_failed_bins = [:]) {

// Set up the e-mail variables
def subject = "[$workflow.manifest.name] Successful: $workflow.runName"
if (busco_failed_bins.size() > 0) {
subject = "[$workflow.manifest.name] Partially successful: For ${busco_failed_bins.size()} bin(s) the BUSCO analysis failed because no genes where found or placements failed: $workflow.runName"
}
if (!workflow.success) {
subject = "[$workflow.manifest.name] FAILED: $workflow.runName"
}
Expand Down Expand Up @@ -40,6 +43,7 @@ class Completion {
email_fields['commandLine'] = workflow.commandLine
email_fields['projectDir'] = workflow.projectDir
email_fields['summary'] = summary << misc_fields
email_fields['busco_failed_bins'] = busco_failed_bins.keySet()

// On success try attach the multiqc report
def mqc_report = null
Expand Down Expand Up @@ -111,8 +115,34 @@ class Completion {
output_tf.withWriter { w -> w << email_txt }
}

static void summary(workflow, params, log) {
static void summary(workflow, params, log, busco_failed_bins = [:]) {
Map colors = Headers.log_colours(params.monochrome_logs)

if (busco_failed_bins.size() > 0) {
def failed_bins_no_genes = ''
def failed_bins_placements_failed = ''
def count_no_genes = 0
def count_placements_failed = 0
for (bin in busco_failed_bins) {
if (bin.value == "No genes"){
count_no_genes += 1
failed_bins_no_genes += " ${bin.key}\n"
}
if (bin.value == "Placements failed"){
count_placements_failed += 1
failed_bins_placements_failed += " ${bin.key}\n"
}
}
if (params.busco_reference)
log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${busco_failed_bins.size()} bin(s) BUSCO did not find any matching genes:\n${failed_bins_no_genes}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information.${colors.reset}-"
else {
if (count_no_genes > 0)
log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${count_no_genes} bin(s) the BUSCO analysis failed because no BUSCO genes could be found:\n${failed_bins_no_genes}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.err and ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information.${colors.reset}-"
if (count_placements_failed > 0)
log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${count_placements_failed} bin(s) the BUSCO analysis using automated lineage selection failed due to failed placements:\n${failed_bins_placements_failed}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.err and ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information. Results for selected domain are still used.${colors.reset}-"
}
}

if (workflow.success) {
if (workflow.stats.ignoredCount == 0) {
log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-"
Expand Down
Loading

0 comments on commit af7c17d

Please sign in to comment.