Merge pull request #179 from skrakau/update_busco_5.1.0

Update to busco 5.1.0 and enable automated lineage selection
nf-core · Apr 26, 2021 · af7c17d · af7c17d
2 parents a774b80 + 5e9c00b
commit af7c17d
Show file tree

Hide file tree

Showing 16 changed files with 501 additions and 138 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
+- [#179](https://github.com/nf-core/mag/pull/179) - Add BUSCO automated lineage selection functionality (new default). The pameter `--busco_auto_lineage_prok` can be used to only consider prokaryotes and the parameter `--busco_download_path` to run BUSCO in `offline` mode.
+
 ### `Changed`
 
 - [#162](https://github.com/nf-core/mag/pull/162) - Switch to DSL2
@@ -17,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#162](https://github.com/nf-core/mag/pull/162) - Update `FastQC` from version `0.11.8` to `0.11.9`
 - [#172](https://github.com/nf-core/mag/pull/172) - Compressed discarded MetaBAT2 output files
 - [#176](https://github.com/nf-core/mag/pull/176) - Update CAT DB link
+- [#179](https://github.com/nf-core/mag/pull/179) - Update `BUSCO` from version `4.1.4` to `5.1.0`
+- [#179](https://github.com/nf-core/mag/pull/179) - By default BUSCO now performs automated lineage selection instead of using the bacteria_odb10 lineage as reference. Specific lineage datasets can still be provided via `--busco_reference`.
 
 ### `Fixed`
 

diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -23,7 +23,8 @@ top_modules:
     info: 'After trimming and, if requested, contamination removal.'
     path_filters:
         - '*trimmed*'
-- 'busco'
+- 'busco':
+    info: 'assesses genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs. In case BUSCO''s automated lineage selection was used, only generic results for the selected domain are shown and only for genome bins and kept, unbinned contigs for which the BUSCO analysis was successfull, i.e. not for contigs for which no BUSCO genes could be found. Bins for which a specific virus lineage was selected are also not shown.'
 - 'quast'
 
 

diff --git a/bin/summary_busco.py b/bin/summary_busco.py
@@ -1,26 +1,140 @@
 #!/usr/bin/env python
 
-# USAGE: ./summary.busco.py *.txt
+# USAGE: ./summary.busco.py -sd <summaries_domain> -ss <summaries_specific> -f <failed_bins>
 
 import re
-from sys import argv
-
-# "# Summarized benchmarking in BUSCO notation for file MEGAHIT-testset1.contigs.fa"
-# "	C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148"
-
-regexes = [r"# Summarized benchmarking in BUSCO notation for file (\S+)", r"	C:(\S+)%\[S:",
-           r"%\[S:(\S+)%,D:", r"%,D:(\S+)%\],F:", r"%\],F:(\S+)%,M:", r"%,M:(\S+)%,n:", r"%,n:(\S+)"]
-columns = ["GenomeBin", "%Complete", "%Complete and single-copy",
-           "%Complete and duplicated", "%Fragmented", "%Missing", "Total number"]
-
-# Search each file using its regex
-print("\t".join(columns))
-for FILE in argv[1:]:
-    with open(FILE) as x:
-        results = []
-        TEXT = x.read()
-        for REGEX in regexes:
-            match = re.search(REGEX, TEXT)
-            if match:
-                results.append(match.group(1))
-        print("\t".join(results))
+import sys
+import argparse
+import os.path
+import pandas as pd
+
+def parse_args(args=None):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-a', '--auto', default=False, action='store_true', help="BUSCO run in auto lineage selection mode.")
+    parser.add_argument('-sd', "--summaries_domain", nargs="+", metavar='FILE', help="List of BUSCO summary files for domains.")
+    parser.add_argument('-ss', "--summaries_specific", nargs="+", metavar='FILE', help="List of BUSCO summary files for specific lineages.")
+    parser.add_argument('-f', "--failed_bins", nargs="+", metavar='FILE', help="List of files containing bin name for which BUSCO analysis failed.")
+    parser.add_argument('-o', "--out", required=True, metavar='FILE', type=argparse.FileType('w'), help="Output file containing final BUSCO summary.")
+    return parser.parse_args(args)
+
+
+def main(args=None):
+    args = parse_args(args)
+
+    if not args.summaries_domain and not args.summaries_specific and not args.failed_bins:
+        sys.exit("Either --summaries_domain, --summaries_specific or --failed_bins must be specified!")
+
+    # "# Summarized benchmarking in BUSCO notation for file /path/to/MEGAHIT-testset1.contigs.fa"
+    # "	C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148"
+
+    regexes = [r"# Summarized benchmarking in BUSCO notation for file (\S+)", r"# The lineage dataset is: (\S+) \(", r"	C:(\S+)%\[S:",
+            r"%\[S:(\S+)%,D:", r"%,D:(\S+)%\],F:", r"%\],F:(\S+)%,M:", r"%,M:(\S+)%,n:", r"%,n:(\S+)"]
+    columns_domain = ["GenomeBin", \
+                     "Domain", \
+                     "%Complete (domain)", \
+                     "%Complete and single-copy (domain)", \
+                     "%Complete and duplicated (domain)", \
+                     "%Fragmented (domain)", \
+                     "%Missing (domain)", \
+                     "Total number (domain)"]
+    columns_specific = ["GenomeBin", \
+                        "Specific lineage dataset", \
+                        "%Complete (specific)", \
+                        "%Complete and single-copy (specific)", \
+                        "%Complete and duplicated (specific)", \
+                        "%Fragmented (specific)", \
+                        "%Missing (specific)", \
+                        "Total number (specific)"]
+
+    if args.auto:
+        columns = ["GenomeBin", \
+                        "Domain", \
+                        "%Complete (domain)", \
+                        "%Complete and single-copy (domain)", \
+                        "%Complete and duplicated (domain)", \
+                        "%Fragmented (domain)", \
+                        "%Missing (domain)", \
+                        "Total number (domain)", \
+                        "Specific lineage dataset", \
+                        "%Complete (specific)", \
+                        "%Complete and single-copy (specific)", \
+                        "%Complete and duplicated (specific)", \
+                        "%Fragmented (specific)", \
+                        "%Missing (specific)", \
+                        "Total number (specific)"]
+    else:
+        columns = ["GenomeBin", \
+                        "Specific lineage dataset", \
+                        "%Complete (specific)", \
+                        "%Complete and single-copy (specific)", \
+                        "%Complete and duplicated (specific)", \
+                        "%Fragmented (specific)", \
+                        "%Missing (specific)", \
+                        "Total number (specific)"]
+
+    # Search each summary file using its regex
+    results_domain = []
+    if args.summaries_domain:
+        for file in args.summaries_domain:
+            with open(file) as infile:
+                results = []
+                text = infile.read()
+                for index, regex in enumerate(regexes):
+                    match = re.search(regex, text)
+                    if match:
+                        if index == 0:
+                            results.append(os.path.basename(match.group(1)))
+                        else:
+                            results.append(match.group(1))
+                results_domain.append(results)
+    df_domain = pd.DataFrame(results_domain, columns=columns_domain)
+
+    results_specific = []
+    if args.summaries_specific:
+        for file in args.summaries_specific:
+            with open(file) as infile:
+                results = []
+                text = infile.read()
+                for index, regex in enumerate(regexes):
+                    match = re.search(regex, text)
+                    if match:
+                        if index == 0:
+                            results.append(os.path.basename(match.group(1)))
+                        else:
+                            results.append(match.group(1))
+                results_specific.append(results)
+    df_specific = pd.DataFrame(results_specific, columns=columns_specific)
+
+    # Add entries for bins with failed analysis (for domain and specific lineage where applicable)
+    failed = []
+    if args.failed_bins:
+        for file in args.failed_bins:
+            with open(file) as infile:
+                line = infile.readline()
+                # in case of failed placements domain summary was used and specific part will be filled with NAs when merging
+                if re.split(r'[\t\n]', line)[1] != "Placements failed":
+                    failed_bin = re.split(r'[\t\n]', line)[0]
+                    if args.auto:
+                        results = [failed_bin, pd.NA, "0.0", "0.0", "0.0", "0.0", "100.0", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA]
+                    else:
+                        results = [failed_bin, pd.NA, "0.0", "0.0", "0.0", "0.0", "100.0", pd.NA]
+                    failed.append(results)
+    df_failed = pd.DataFrame(failed, columns=columns)
+
+    # merge results
+    if args.auto:
+        df_final = df_domain\
+            .merge(df_specific, on="GenomeBin", how='outer')\
+            .append(df_failed)
+        # check if 'Domain' is 'NA', but 'Specific lineage dataset' given -> 'Viruses'
+        df_final.loc[pd.isna(df_final['Domain']) & pd.notna(df_final['Specific lineage dataset']), 'Domain'] = "Viruses"
+
+    else:
+        df_final = df_specific\
+            .append(df_failed)
+
+    df_final.to_csv(args.out, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/base.config b/conf/base.config
@@ -106,6 +106,10 @@ process {
     memory = { check_max (20.GB * task.attempt, 'memory' ) }
     time = { check_max (8.h * task.attempt, 'time' ) }
   }
+  withName: BUSCO {
+    cpus = { check_max (8 * task.attempt, 'cpus' ) }
+    memory = { check_max (20.GB * task.attempt, 'memory' ) }
+  }
   withName: GET_SOFTWARE_VERSIONS {
     cache = false
   }

diff --git a/conf/modules.config b/conf/modules.config
@@ -108,6 +108,9 @@ params {
         'busco' {
             publish_dir    = "GenomeBinning/QC/BUSCO"
         }
+        'busco_save_download' {
+            publish_dir    = "GenomeBinning/QC/BUSCO"
+        }
         'busco_plot' {
             publish_dir    = "GenomeBinning/QC/BUSCO"
         }

diff --git a/docs/output.md b/docs/output.md
@@ -249,26 +249,32 @@ Files in these two folders contain all contigs of an assembly.
 
 ### QC for metagenome assembled genomes with BUSCO
 
-[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by MetaBAT2.
+[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by MetaBAT2. By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_reference`, only results for this specific lineage will be generated.
 
 **Output files:**
 
 * `GenomeBinning/QC/BUSCO/`
-  * `[assembler]-[bin]_busco.log`: BUSCO log file
-  * `[assembler]-[bin]_buscos.fna.gz`: Nucleotide sequence of all identified BUSCOs
-  * `[assembler]-[bin]_buscos.faa.gz`: Aminoacid sequence of all identified BUSCOs
+  * `[assembler]-[bin]_busco.log`: Log file containing the standard output of BUSCO.
+  * `[assembler]-[bin]_busco.err`: File containing potential error messages returned from BUSCO.
+  * `short_summary.domain.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results for the selected domain when run in automated lineage selection mode. Not available for bins for which a viral lineage was selected.
+  * `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_reference`.
+  * `[assembler]-[bin]_buscos.[lineage].fna.gz`: Nucleotide sequence of all identified BUSCOs for used lineages (domain or specific).
+  * `[assembler]-[bin]_buscos.[lineage].faa.gz`: Aminoacid sequence of all identified BUSCOs for used lineages (domain or specific).
 
-If the parameter `--save_busco_reference` is set the used BUSCO reference is stored.
+If the parameter `--save_busco_reference` is set, additionally the used BUSCO lineage datasets are stored in the output directy.
 
 **Output files:**
 
-* `GenomeBinning/QC/BUSCO/reference/`
-  * `*.tar.gz`: BUSCO reference file
+* `GenomeBinning/QC/BUSCO/`
+  * `busco_downloads/`: All files and lineage datasets downloaded by BUSCO when run in automated lineage selection mode. (Can currently not be used to reproduce analysis, see the [nf-core/mag website documentation](https://nf-co.re/mag/usage#reproducibility) how to achieve reproducible BUSCO results).
+  * `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_reference`.
+
+Besides the reference files or output files created by BUSCO, the following summary files will be generated:
 
 **Output files:**
 
 * `GenomeBinning/QC/`
-  * `busco_summary.txt`: A summary table of the BUSCO results, with % of marker genes found
+  * `busco_summary.tsv`: A summary table of the BUSCO results, with % of marker genes found. If run in automated lineage selection mode, both the results for the selected domain and for the selected more specific lineage will be given, if available.
   * `quast_and_busco_summary.tsv`; Summary of BUSCO and QUAST results
 
 ## MultiQC

diff --git a/docs/usage.md b/docs/usage.md
@@ -48,7 +48,7 @@ You can fix this by using the prameter `--megahit_fix_cpu_1`. In both cases, do
 
 MetaBAT2 is run by default with a fixed seed within this pipeline, thus producing reproducible results.
 
-To allow also reproducible bin QC with BUSCO, the parameter `--save_busco_reference` can be used to store the reference database. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain accessible.
+To allow also reproducible bin QC with BUSCO, run BUSCO providing already downloaded lineage datasets with `--busco_download_path` (BUSCO will be run using automated lineage selection in offline mode) or provide a specific lineage dataset via `--busco_reference` and use the parameter `--save_busco_reference`. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain (easily) accessible.
 
 ## Core Nextflow arguments
 

diff --git a/lib/Completion.groovy b/lib/Completion.groovy
@@ -3,10 +3,13 @@
  */
 
 class Completion {
-    static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[]) {
+    static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], busco_failed_bins = [:]) {
 
         // Set up the e-mail variables
         def subject = "[$workflow.manifest.name] Successful: $workflow.runName"
+        if (busco_failed_bins.size() > 0) {
+            subject = "[$workflow.manifest.name] Partially successful: For ${busco_failed_bins.size()} bin(s) the BUSCO analysis failed because no genes where found or placements failed: $workflow.runName"
+        }
         if (!workflow.success) {
             subject = "[$workflow.manifest.name] FAILED: $workflow.runName"
         }
@@ -40,6 +43,7 @@ class Completion {
         email_fields['commandLine']  = workflow.commandLine
         email_fields['projectDir']   = workflow.projectDir
         email_fields['summary']      = summary << misc_fields
+        email_fields['busco_failed_bins'] = busco_failed_bins.keySet()
 
         // On success try attach the multiqc report
         def mqc_report = null
@@ -111,8 +115,34 @@ class Completion {
         output_tf.withWriter { w -> w << email_txt }
     }
 
-    static void summary(workflow, params, log) {
+    static void summary(workflow, params, log, busco_failed_bins = [:]) {
         Map colors = Headers.log_colours(params.monochrome_logs)
+
+        if (busco_failed_bins.size() > 0) {
+            def failed_bins_no_genes = ''
+            def failed_bins_placements_failed = ''
+            def count_no_genes = 0
+            def count_placements_failed = 0
+            for (bin in busco_failed_bins) {
+                if (bin.value == "No genes"){
+                    count_no_genes += 1
+                    failed_bins_no_genes += "    ${bin.key}\n"
+                }
+                if (bin.value == "Placements failed"){
+                    count_placements_failed += 1
+                    failed_bins_placements_failed += "    ${bin.key}\n"
+                }
+            }
+            if (params.busco_reference)
+                log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${busco_failed_bins.size()} bin(s) BUSCO did not find any matching genes:\n${failed_bins_no_genes}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information.${colors.reset}-"
+            else {
+                if (count_no_genes > 0)
+                    log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${count_no_genes} bin(s) the BUSCO analysis failed because no BUSCO genes could be found:\n${failed_bins_no_genes}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.err and ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information.${colors.reset}-"
+                if (count_placements_failed > 0)
+                    log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${count_placements_failed} bin(s) the BUSCO analysis using automated lineage selection failed due to failed placements:\n${failed_bins_placements_failed}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.err and ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information. Results for selected domain are still used.${colors.reset}-"
+            }
+        }
+
         if (workflow.success) {
             if (workflow.stats.ignoredCount == 0) {
                 log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-"