diff --git a/CHANGELOG.md b/CHANGELOG.md index 22556343..a468bb26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#179](https://github.com/nf-core/mag/pull/179) - Add BUSCO automated lineage selection functionality (new default). The pameter `--busco_auto_lineage_prok` can be used to only consider prokaryotes and the parameter `--busco_download_path` to run BUSCO in `offline` mode. + ### `Changed` - [#162](https://github.com/nf-core/mag/pull/162) - Switch to DSL2 @@ -17,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#162](https://github.com/nf-core/mag/pull/162) - Update `FastQC` from version `0.11.8` to `0.11.9` - [#172](https://github.com/nf-core/mag/pull/172) - Compressed discarded MetaBAT2 output files - [#176](https://github.com/nf-core/mag/pull/176) - Update CAT DB link +- [#179](https://github.com/nf-core/mag/pull/179) - Update `BUSCO` from version `4.1.4` to `5.1.0` +- [#179](https://github.com/nf-core/mag/pull/179) - By default BUSCO now performs automated lineage selection instead of using the bacteria_odb10 lineage as reference. Specific lineage datasets can still be provided via `--busco_reference`. ### `Fixed` diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index a2b32ccf..1fbe2340 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -23,7 +23,8 @@ top_modules: info: 'After trimming and, if requested, contamination removal.' path_filters: - '*trimmed*' -- 'busco' +- 'busco': + info: 'assesses genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs. In case BUSCO''s automated lineage selection was used, only generic results for the selected domain are shown and only for genome bins and kept, unbinned contigs for which the BUSCO analysis was successfull, i.e. not for contigs for which no BUSCO genes could be found. Bins for which a specific virus lineage was selected are also not shown.' - 'quast' diff --git a/bin/summary_busco.py b/bin/summary_busco.py index f3dd2eba..e5d80887 100755 --- a/bin/summary_busco.py +++ b/bin/summary_busco.py @@ -1,26 +1,140 @@ #!/usr/bin/env python -# USAGE: ./summary.busco.py *.txt +# USAGE: ./summary.busco.py -sd -ss -f import re -from sys import argv - -# "# Summarized benchmarking in BUSCO notation for file MEGAHIT-testset1.contigs.fa" -# " C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148" - -regexes = [r"# Summarized benchmarking in BUSCO notation for file (\S+)", r" C:(\S+)%\[S:", - r"%\[S:(\S+)%,D:", r"%,D:(\S+)%\],F:", r"%\],F:(\S+)%,M:", r"%,M:(\S+)%,n:", r"%,n:(\S+)"] -columns = ["GenomeBin", "%Complete", "%Complete and single-copy", - "%Complete and duplicated", "%Fragmented", "%Missing", "Total number"] - -# Search each file using its regex -print("\t".join(columns)) -for FILE in argv[1:]: - with open(FILE) as x: - results = [] - TEXT = x.read() - for REGEX in regexes: - match = re.search(REGEX, TEXT) - if match: - results.append(match.group(1)) - print("\t".join(results)) +import sys +import argparse +import os.path +import pandas as pd + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument('-a', '--auto', default=False, action='store_true', help="BUSCO run in auto lineage selection mode.") + parser.add_argument('-sd', "--summaries_domain", nargs="+", metavar='FILE', help="List of BUSCO summary files for domains.") + parser.add_argument('-ss', "--summaries_specific", nargs="+", metavar='FILE', help="List of BUSCO summary files for specific lineages.") + parser.add_argument('-f', "--failed_bins", nargs="+", metavar='FILE', help="List of files containing bin name for which BUSCO analysis failed.") + parser.add_argument('-o', "--out", required=True, metavar='FILE', type=argparse.FileType('w'), help="Output file containing final BUSCO summary.") + return parser.parse_args(args) + + +def main(args=None): + args = parse_args(args) + + if not args.summaries_domain and not args.summaries_specific and not args.failed_bins: + sys.exit("Either --summaries_domain, --summaries_specific or --failed_bins must be specified!") + + # "# Summarized benchmarking in BUSCO notation for file /path/to/MEGAHIT-testset1.contigs.fa" + # " C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148" + + regexes = [r"# Summarized benchmarking in BUSCO notation for file (\S+)", r"# The lineage dataset is: (\S+) \(", r" C:(\S+)%\[S:", + r"%\[S:(\S+)%,D:", r"%,D:(\S+)%\],F:", r"%\],F:(\S+)%,M:", r"%,M:(\S+)%,n:", r"%,n:(\S+)"] + columns_domain = ["GenomeBin", \ + "Domain", \ + "%Complete (domain)", \ + "%Complete and single-copy (domain)", \ + "%Complete and duplicated (domain)", \ + "%Fragmented (domain)", \ + "%Missing (domain)", \ + "Total number (domain)"] + columns_specific = ["GenomeBin", \ + "Specific lineage dataset", \ + "%Complete (specific)", \ + "%Complete and single-copy (specific)", \ + "%Complete and duplicated (specific)", \ + "%Fragmented (specific)", \ + "%Missing (specific)", \ + "Total number (specific)"] + + if args.auto: + columns = ["GenomeBin", \ + "Domain", \ + "%Complete (domain)", \ + "%Complete and single-copy (domain)", \ + "%Complete and duplicated (domain)", \ + "%Fragmented (domain)", \ + "%Missing (domain)", \ + "Total number (domain)", \ + "Specific lineage dataset", \ + "%Complete (specific)", \ + "%Complete and single-copy (specific)", \ + "%Complete and duplicated (specific)", \ + "%Fragmented (specific)", \ + "%Missing (specific)", \ + "Total number (specific)"] + else: + columns = ["GenomeBin", \ + "Specific lineage dataset", \ + "%Complete (specific)", \ + "%Complete and single-copy (specific)", \ + "%Complete and duplicated (specific)", \ + "%Fragmented (specific)", \ + "%Missing (specific)", \ + "Total number (specific)"] + + # Search each summary file using its regex + results_domain = [] + if args.summaries_domain: + for file in args.summaries_domain: + with open(file) as infile: + results = [] + text = infile.read() + for index, regex in enumerate(regexes): + match = re.search(regex, text) + if match: + if index == 0: + results.append(os.path.basename(match.group(1))) + else: + results.append(match.group(1)) + results_domain.append(results) + df_domain = pd.DataFrame(results_domain, columns=columns_domain) + + results_specific = [] + if args.summaries_specific: + for file in args.summaries_specific: + with open(file) as infile: + results = [] + text = infile.read() + for index, regex in enumerate(regexes): + match = re.search(regex, text) + if match: + if index == 0: + results.append(os.path.basename(match.group(1))) + else: + results.append(match.group(1)) + results_specific.append(results) + df_specific = pd.DataFrame(results_specific, columns=columns_specific) + + # Add entries for bins with failed analysis (for domain and specific lineage where applicable) + failed = [] + if args.failed_bins: + for file in args.failed_bins: + with open(file) as infile: + line = infile.readline() + # in case of failed placements domain summary was used and specific part will be filled with NAs when merging + if re.split(r'[\t\n]', line)[1] != "Placements failed": + failed_bin = re.split(r'[\t\n]', line)[0] + if args.auto: + results = [failed_bin, pd.NA, "0.0", "0.0", "0.0", "0.0", "100.0", pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA] + else: + results = [failed_bin, pd.NA, "0.0", "0.0", "0.0", "0.0", "100.0", pd.NA] + failed.append(results) + df_failed = pd.DataFrame(failed, columns=columns) + + # merge results + if args.auto: + df_final = df_domain\ + .merge(df_specific, on="GenomeBin", how='outer')\ + .append(df_failed) + # check if 'Domain' is 'NA', but 'Specific lineage dataset' given -> 'Viruses' + df_final.loc[pd.isna(df_final['Domain']) & pd.notna(df_final['Specific lineage dataset']), 'Domain'] = "Viruses" + + else: + df_final = df_specific\ + .append(df_failed) + + df_final.to_csv(args.out, sep="\t", index=False) + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/conf/base.config b/conf/base.config index 5a9faa5d..7a50c4b2 100644 --- a/conf/base.config +++ b/conf/base.config @@ -106,6 +106,10 @@ process { memory = { check_max (20.GB * task.attempt, 'memory' ) } time = { check_max (8.h * task.attempt, 'time' ) } } + withName: BUSCO { + cpus = { check_max (8 * task.attempt, 'cpus' ) } + memory = { check_max (20.GB * task.attempt, 'memory' ) } + } withName: GET_SOFTWARE_VERSIONS { cache = false } diff --git a/conf/modules.config b/conf/modules.config index b440cfba..c488f6c7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -108,6 +108,9 @@ params { 'busco' { publish_dir = "GenomeBinning/QC/BUSCO" } + 'busco_save_download' { + publish_dir = "GenomeBinning/QC/BUSCO" + } 'busco_plot' { publish_dir = "GenomeBinning/QC/BUSCO" } diff --git a/docs/output.md b/docs/output.md index ecd83f4a..aa142cfa 100644 --- a/docs/output.md +++ b/docs/output.md @@ -249,26 +249,32 @@ Files in these two folders contain all contigs of an assembly. ### QC for metagenome assembled genomes with BUSCO -[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by MetaBAT2. +[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by MetaBAT2. By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_reference`, only results for this specific lineage will be generated. **Output files:** * `GenomeBinning/QC/BUSCO/` - * `[assembler]-[bin]_busco.log`: BUSCO log file - * `[assembler]-[bin]_buscos.fna.gz`: Nucleotide sequence of all identified BUSCOs - * `[assembler]-[bin]_buscos.faa.gz`: Aminoacid sequence of all identified BUSCOs + * `[assembler]-[bin]_busco.log`: Log file containing the standard output of BUSCO. + * `[assembler]-[bin]_busco.err`: File containing potential error messages returned from BUSCO. + * `short_summary.domain.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results for the selected domain when run in automated lineage selection mode. Not available for bins for which a viral lineage was selected. + * `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_reference`. + * `[assembler]-[bin]_buscos.[lineage].fna.gz`: Nucleotide sequence of all identified BUSCOs for used lineages (domain or specific). + * `[assembler]-[bin]_buscos.[lineage].faa.gz`: Aminoacid sequence of all identified BUSCOs for used lineages (domain or specific). -If the parameter `--save_busco_reference` is set the used BUSCO reference is stored. +If the parameter `--save_busco_reference` is set, additionally the used BUSCO lineage datasets are stored in the output directy. **Output files:** -* `GenomeBinning/QC/BUSCO/reference/` - * `*.tar.gz`: BUSCO reference file +* `GenomeBinning/QC/BUSCO/` + * `busco_downloads/`: All files and lineage datasets downloaded by BUSCO when run in automated lineage selection mode. (Can currently not be used to reproduce analysis, see the [nf-core/mag website documentation](https://nf-co.re/mag/usage#reproducibility) how to achieve reproducible BUSCO results). + * `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_reference`. + +Besides the reference files or output files created by BUSCO, the following summary files will be generated: **Output files:** * `GenomeBinning/QC/` - * `busco_summary.txt`: A summary table of the BUSCO results, with % of marker genes found + * `busco_summary.tsv`: A summary table of the BUSCO results, with % of marker genes found. If run in automated lineage selection mode, both the results for the selected domain and for the selected more specific lineage will be given, if available. * `quast_and_busco_summary.tsv`; Summary of BUSCO and QUAST results ## MultiQC diff --git a/docs/usage.md b/docs/usage.md index ca0658c9..c4ebd390 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -48,7 +48,7 @@ You can fix this by using the prameter `--megahit_fix_cpu_1`. In both cases, do MetaBAT2 is run by default with a fixed seed within this pipeline, thus producing reproducible results. -To allow also reproducible bin QC with BUSCO, the parameter `--save_busco_reference` can be used to store the reference database. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain accessible. +To allow also reproducible bin QC with BUSCO, run BUSCO providing already downloaded lineage datasets with `--busco_download_path` (BUSCO will be run using automated lineage selection in offline mode) or provide a specific lineage dataset via `--busco_reference` and use the parameter `--save_busco_reference`. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain (easily) accessible. ## Core Nextflow arguments diff --git a/lib/Completion.groovy b/lib/Completion.groovy index e745c986..15bb70c4 100644 --- a/lib/Completion.groovy +++ b/lib/Completion.groovy @@ -3,10 +3,13 @@ */ class Completion { - static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[]) { + static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], busco_failed_bins = [:]) { // Set up the e-mail variables def subject = "[$workflow.manifest.name] Successful: $workflow.runName" + if (busco_failed_bins.size() > 0) { + subject = "[$workflow.manifest.name] Partially successful: For ${busco_failed_bins.size()} bin(s) the BUSCO analysis failed because no genes where found or placements failed: $workflow.runName" + } if (!workflow.success) { subject = "[$workflow.manifest.name] FAILED: $workflow.runName" } @@ -40,6 +43,7 @@ class Completion { email_fields['commandLine'] = workflow.commandLine email_fields['projectDir'] = workflow.projectDir email_fields['summary'] = summary << misc_fields + email_fields['busco_failed_bins'] = busco_failed_bins.keySet() // On success try attach the multiqc report def mqc_report = null @@ -111,8 +115,34 @@ class Completion { output_tf.withWriter { w -> w << email_txt } } - static void summary(workflow, params, log) { + static void summary(workflow, params, log, busco_failed_bins = [:]) { Map colors = Headers.log_colours(params.monochrome_logs) + + if (busco_failed_bins.size() > 0) { + def failed_bins_no_genes = '' + def failed_bins_placements_failed = '' + def count_no_genes = 0 + def count_placements_failed = 0 + for (bin in busco_failed_bins) { + if (bin.value == "No genes"){ + count_no_genes += 1 + failed_bins_no_genes += " ${bin.key}\n" + } + if (bin.value == "Placements failed"){ + count_placements_failed += 1 + failed_bins_placements_failed += " ${bin.key}\n" + } + } + if (params.busco_reference) + log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${busco_failed_bins.size()} bin(s) BUSCO did not find any matching genes:\n${failed_bins_no_genes}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information.${colors.reset}-" + else { + if (count_no_genes > 0) + log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${count_no_genes} bin(s) the BUSCO analysis failed because no BUSCO genes could be found:\n${failed_bins_no_genes}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.err and ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information.${colors.reset}-" + if (count_placements_failed > 0) + log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} For ${count_placements_failed} bin(s) the BUSCO analysis using automated lineage selection failed due to failed placements:\n${failed_bins_placements_failed}See ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.err and ${params.outdir}/GenomeBinning/QC/BUSCO/[bin]_busco.log for further information. Results for selected domain are still used.${colors.reset}-" + } + } + if (workflow.success) { if (workflow.stats.ignoredCount == 0) { log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" diff --git a/main.nf b/main.nf index 19099d3e..c41c6b02 100644 --- a/main.nf +++ b/main.nf @@ -83,7 +83,7 @@ include { // Local: Sub-workflows include { INPUT_CHECK } from './subworkflows/local/input_check' include { METABAT2_BINNING } from './subworkflows/local/metabat2_binning' addParams( bowtie2_align_options: modules['bowtie2_assembly_align'], metabat2_options: modules['metabat2'] ) -include { BUSCO_QC } from './subworkflows/local/busco_qc' addParams( busco_db_options: modules['busco_db_preparation'], busco_options: modules['busco'], busco_plot_options: modules['busco_plot'], busco_summary_options: modules['busco_summary']) +include { BUSCO_QC } from './subworkflows/local/busco_qc' addParams( busco_db_options: modules['busco_db_preparation'], busco_options: modules['busco'], busco_save_download_options: modules['busco_save_download'], busco_plot_options: modules['busco_plot'], busco_summary_options: modules['busco_summary']) // nf-core/modules: Modules include { FASTQC as FASTQC_RAW } from './modules/nf-core/software/fastqc/main' addParams( options: modules['fastqc_raw'] ) @@ -203,17 +203,37 @@ if ( params.host_genome ) { ch_host_fasta = Channel.empty() } +if (params.skip_busco){ + if (params.busco_reference) + exit 1, "Both --skip_busco and --busco_reference are specififed! Invalid combination, please specify either --skip_busco or --busco_reference." + if (params.busco_download_path) + exit 1, "Both --skip_busco and --busco_download_path are specififed! Invalid combination, please specify either --skip_busco or --busco_download_path." + if (params.busco_auto_lineage_prok) + exit 1, "Both --skip_busco and --busco_auto_lineage_prok are specififed! Invalid combination, please specify either --skip_busco or --busco_auto_lineage_prok." +} +if (params.busco_reference && params.busco_download_path) + exit 1, "Both --busco_reference and --busco_download_path are specififed! Invalid combination, please specify either --busco_reference or --busco_download_path." +if (params.busco_auto_lineage_prok && params.busco_reference) + exit 1, "Both --busco_auto_lineage_prok and --busco_reference are specififed! Invalid combination, please specify either --busco_auto_lineage_prok or --busco_reference." + //////////////////////////////////////////////////// /* -- Create channel for reference databases -- */ //////////////////////////////////////////////////// -if(!params.skip_busco){ +if(params.busco_reference){ Channel .value(file( "${params.busco_reference}" )) .set { ch_busco_db_file } } else { ch_busco_db_file = Channel.empty() } +if (params.busco_download_path) { + Channel + .value(file( "${params.busco_download_path}" )) + .set { ch_busco_download_folder } +} else { + ch_busco_download_folder = Channel.empty() +} if(params.centrifuge_db){ Channel @@ -263,7 +283,8 @@ ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multi //////////////////////////////////////////////////// // Info required for completion email and summary -def multiqc_report = [] +def multiqc_report = [] +def busco_failed_bins = [:] workflow { @@ -532,6 +553,7 @@ workflow { */ ch_bowtie2_assembly_multiqc = Channel.empty() + ch_busco_summary = Channel.empty() ch_busco_multiqc = Channel.empty() if (!params.skip_binning){ @@ -543,22 +565,31 @@ workflow { ch_software_versions = ch_software_versions.mix(METABAT2_BINNING.out.bowtie2_version.first().ifEmpty(null)) ch_software_versions = ch_software_versions.mix(METABAT2_BINNING.out.metabat2_version.first().ifEmpty(null)) - /* - * BUSCO subworkflow: Quantitative measures for the assessment of genome assembly - */ - BUSCO_QC ( - ch_busco_db_file, - METABAT2_BINNING.out.bins.transpose() - ) - ch_busco_multiqc = BUSCO_QC.out.multiqc - ch_software_versions = ch_software_versions.mix(BUSCO_QC.out.version.first().ifEmpty(null)) + if (!params.skip_busco){ + /* + * BUSCO subworkflow: Quantitative measures for the assessment of genome assembly + */ + BUSCO_QC ( + ch_busco_db_file, + ch_busco_download_folder, + METABAT2_BINNING.out.bins.transpose() + ) + ch_busco_summary = BUSCO_QC.out.summary + ch_busco_multiqc = BUSCO_QC.out.multiqc + ch_software_versions = ch_software_versions.mix(BUSCO_QC.out.version.first().ifEmpty(null)) + // process information if BUSCO analysis failed for individual bins due to no matching genes + BUSCO_QC.out + .failed_bin + .splitCsv(sep: '\t') + .map { bin, error -> if (!bin.contains(".unbinned.")) busco_failed_bins[bin] = error } + } if (!params.skip_quast){ QUAST_BINS ( METABAT2_BINNING.out.bins ) ch_software_versions = ch_software_versions.mix(QUAST_BINS.out.version.first().ifEmpty(null)) MERGE_QUAST_AND_BUSCO ( QUAST_BINS.out.quast_bin_summaries.collect(), - BUSCO_QC.out.summary + ch_busco_summary ) } @@ -612,8 +643,8 @@ workflow { //////////////////////////////////////////////////// workflow.onComplete { - Completion.email(workflow, params, summary_params, projectDir, log, multiqc_report) - Completion.summary(workflow, params, log) + Completion.email(workflow, params, summary_params, projectDir, log, multiqc_report, busco_failed_bins) + Completion.summary(workflow, params, log, busco_failed_bins) } //////////////////////////////////////////////////// diff --git a/modules/local/busco.nf b/modules/local/busco.nf index ae19d2a4..089cc2f4 100644 --- a/modules/local/busco.nf +++ b/modules/local/busco.nf @@ -9,38 +9,51 @@ process BUSCO { publishDir "${params.outdir}", mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + saveAs: { filename -> filename.indexOf("busco_downloads") == -1 ? saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') : null } - conda (params.enable_conda ? "bioconda::busco=4.1.4" : null) + conda (params.enable_conda ? "bioconda::busco=5.1.0" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/busco:4.1.4--py_2" + container "https://depot.galaxyproject.org/singularity/busco:5.1.0--py_1" } else { - container "quay.io/biocontainers/busco:4.1.4--py_2" + container "quay.io/biocontainers/busco:5.1.0--py_1" } input: tuple val(meta), path(bin) path(db) + path(download_folder) output: - tuple val(meta), path("short_summary.specific.*.${bin}.txt"), emit: summary + tuple val(meta), path("short_summary.domain.*.${bin}.txt") , optional:true , emit: summary_domain + tuple val(meta), path("short_summary.specific_lineage.*.${bin}.txt"), optional:true , emit: summary_specific + tuple env(most_spec_db), path('busco_downloads/') , optional:true , emit: busco_downloads path("${bin}_busco.log") - path("${bin}_buscos.faa.gz") optional true - path("${bin}_buscos.fna.gz") optional true - path '*.version.txt' , emit: version + path("${bin}_busco.err") + path("${bin}_buscos.*.faa.gz") , optional:true + path("${bin}_buscos.*.fna.gz") , optional:true + tuple val(meta), path("${bin}_busco.failed_bin.txt") , optional:true , emit: failed_bin + path '*.version.txt' , emit: version script: def software = getSoftwareName(task.process) - if( workflow.profile.toString().indexOf("conda") == -1) - cp_augustus_config = "Y" - else + def cp_augustus_config = "Y" + if( workflow.profile.toString().indexOf("conda") != -1) cp_augustus_config = "N" - """ - # get path to custom config file for busco (already configured during conda installation) - busco_path="\$(which busco)" - config_file="\${busco_path%bin/busco}share/busco/config.ini" + def lineage_dataset_provided = "N" + if (params.busco_reference) + lineage_dataset_provided = "Y" + def p = "--auto-lineage" + if (params.busco_reference){ + p = "--lineage_dataset dataset/${db}" + } else { + if (params.busco_auto_lineage_prok) + p = "--auto-lineage-prok" + if (params.busco_download_path) + p += " --offline --download_path ${download_folder}" + } + """ # ensure augustus has write access to config directory if [ ${cp_augustus_config} = "Y" ] ; then cp -r /usr/local/config/ augustus_config/ @@ -48,38 +61,125 @@ process BUSCO { fi # place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it) - mkdir dataset - mv ${db} dataset/ + if [ ${lineage_dataset_provided} = "Y" ] ; then + mkdir dataset + mv ${db} dataset/ + fi - busco --lineage_dataset dataset/${db} \ + # set nullgob: if pattern matches no files, expand to a null string rather than to itself + shopt -s nullglob + + # only used for saving busco downloads + most_spec_db="NA" + + if busco ${p} \ --mode genome \ --in ${bin} \ - --config \${config_file} \ --cpu "${task.cpus}" \ - --out "BUSCO" > ${bin}_busco.log + --out "BUSCO" > ${bin}_busco.log 2> ${bin}_busco.err; then - # get used db name - # (set nullgob: if pattern matches no files, expand to a null string rather than to itself) - shopt -s nullglob - summaries=(BUSCO/short_summary.specific.*.BUSCO.txt) - if [ \${#summaries[@]} -ne 1 ]; then - echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one." + # get name of used specific lineage dataset + summaries=(BUSCO/short_summary.specific.*.BUSCO.txt) + if [ \${#summaries[@]} -ne 1 ]; then + echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one." + exit 1 + fi + [[ \$summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]]; + db_name_spec="\${BASH_REMATCH[1]}" + most_spec_db=\${db_name_spec} + echo "Used specific lineage dataset: \${db_name_spec}" + + if [ ${lineage_dataset_provided} = "Y" ]; then + cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt + + # if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting + # report bin as failed to allow consistent warnings within the pipeline for both settings + if egrep -q \$'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log ; then + echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log." + echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt" + fi + else + # auto lineage selection + if { egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log \ + && egrep -q \$'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log ; } || \ + { egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log \ + && egrep -q \$'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log \ + && egrep -q \$'INFO:\tUsing local lineages directory ' ${bin}_busco.log ; }; then + # the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately + # and cause different log messages + echo "Domain and specific lineage could be selected by BUSCO." + cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt + + summaries_gen=(BUSCO/short_summary.generic.*.BUSCO.txt) + if [ \${#summaries_gen[@]} -ne 1 ]; then + echo "ERROR: none or multiple 'BUSCO/short_summary.generic.*.BUSCO.txt' files found. Expected one." + exit 1 + fi + [[ \$summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]]; + db_name_gen="\${BASH_REMATCH[1]}" + echo "Used generic lineage dataset: \${db_name_gen}" + cp BUSCO/short_summary.generic.\${db_name_gen}.BUSCO.txt short_summary.domain.\${db_name_gen}.${bin}.txt + + for f in BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_gen}.faa.gz + break + done + for f in BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_gen}.fna.gz + break + done + + elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log ; then + echo "Domain could be selected by BUSCO, but no more specific lineage." + cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.domain.\${db_name_spec}.${bin}.txt + + elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tRunning virus detection pipeline' ${bin}_busco.log ; then + # TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*? + echo "Domain could not be selected by BUSCO, but virus dataset was selected." + cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt + else + echo "ERROR: Some not expected case occurred! See ${bin}_busco.log." >&2 + exit 1 + fi + fi + + for f in BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_spec}.faa.gz + break + done + for f in BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_spec}.fna.gz + break + done + + elif egrep -q \$'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err ; then + echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err." + echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt" + + elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'ERROR:\tPlacements failed' ${bin}_busco.err ; then + echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset." + echo -e "${bin}\tPlacements failed" > "${bin}_busco.failed_bin.txt" + + message=\$(egrep \$'INFO:\t\\S+ selected' ${bin}_busco.log) + [[ \$message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]]; + db_name_gen="\${BASH_REMATCH[1]}" + most_spec_db=\${db_name_gen} + echo "Used generic lineage dataset: \${db_name_gen}" + cp BUSCO/auto_lineage/run_\${db_name_gen}/short_summary.txt short_summary.domain.\${db_name_gen}.${bin}.txt + + for f in BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_gen}.faa.gz + break + done + for f in BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_gen}.fna.gz + break + done + + else + echo "ERROR: BUSCO analysis failed for some unknown reason! See also ${bin}_busco.err." >&2 exit 1 fi - [[ \$summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]]; - db_name="\${BASH_REMATCH[1]}" - echo "Used database: \${db_name}" - - cp BUSCO/short_summary.specific.\${db_name}.BUSCO.txt short_summary.specific.\${db_name}.${bin}.txt - - for f in BUSCO/run_\${db_name}/busco_sequences/single_copy_busco_sequences/*faa; do - cat BUSCO/run_\${db_name}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.faa.gz - break - done - for f in BUSCO/run_\${db_name}/busco_sequences/single_copy_busco_sequences/*fna; do - cat BUSCO/run_\${db_name}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.fna.gz - break - done busco --version | sed "s/BUSCO //" > ${software}.version.txt """ diff --git a/modules/local/busco_plot.nf b/modules/local/busco_plot.nf index 5c1a9643..d9813370 100644 --- a/modules/local/busco_plot.nf +++ b/modules/local/busco_plot.nf @@ -11,40 +11,41 @@ process BUSCO_PLOT { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } - conda (params.enable_conda ? "bioconda::busco=4.1.4" : null) + conda (params.enable_conda ? "bioconda::busco=5.1.0" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/busco:4.1.4--py_2" + container "https://depot.galaxyproject.org/singularity/busco:5.1.0--py_1" } else { - container "quay.io/biocontainers/busco:4.1.4--py_2" + container "quay.io/biocontainers/busco:5.1.0--py_1" } input: tuple val(meta), path(summaries) output: - path("${meta.assembler}-${meta.id}-busco_figure.png") , emit: png - path("${meta.assembler}-${meta.id}-busco_figure.R") , emit: rscript - path("${meta.assembler}-${meta.id}-busco_summary.txt"), emit: summary - path '*.version.txt' , emit: version + path("${meta.assembler}-${meta.id}.*.busco_figure.png") , optional:true, emit: png + path("${meta.assembler}-${meta.id}.*.busco_figure.R") , optional:true, emit: rscript + path '*.version.txt' , emit: version script: def software = getSoftwareName(task.process) """ - # replace dots in bin names within summary file names by underscores - # currently (BUSCO v4.1.3) generate_plot.py does not allow further dots - for sum in ${summaries}; do - [[ \${sum} =~ short_summary.(.*).${meta.assembler}-${meta.id}.(.*).txt ]]; - db_name=\${BASH_REMATCH[1]} - bin="${meta.assembler}-${meta.id}.\${BASH_REMATCH[2]}" - bin_new="\${bin//./_}" - mv \${sum} short_summary.\${db_name}.\${bin_new}.txt - done - generate_plot.py --working_directory . - - mv busco_figure.png "${meta.assembler}-${meta.id}-busco_figure.png" - mv busco_figure.R "${meta.assembler}-${meta.id}-busco_figure.R" - - summary_busco.py short_summary.*.txt > "${meta.assembler}-${meta.id}-busco_summary.txt" + if [ -n "${summaries}" ] + then + # replace dots in bin names within summary file names by underscores + # currently (BUSCO v5.1.0) generate_plot.py does not allow further dots + for sum in ${summaries}; do + [[ \${sum} =~ short_summary.([_[:alnum:]]+).([_[:alnum:]]+).${meta.assembler}-${meta.id}.(.+).txt ]]; + mode=\${BASH_REMATCH[1]} + db_name=\${BASH_REMATCH[2]} + bin="${meta.assembler}-${meta.id}.\${BASH_REMATCH[3]}" + bin_new="\${bin//./_}" + mv \${sum} short_summary.\${mode}.\${db_name}.\${bin_new}.txt + done + generate_plot.py --working_directory . + + mv busco_figure.png "${meta.assembler}-${meta.id}.\${mode}.\${db_name}.busco_figure.png" + mv busco_figure.R "${meta.assembler}-${meta.id}.\${mode}.\${db_name}.busco_figure.R" + fi busco --version | sed "s/BUSCO //" > ${software}.version.txt """ diff --git a/modules/local/busco_save_download.nf b/modules/local/busco_save_download.nf new file mode 100644 index 00000000..0e0d6c2e --- /dev/null +++ b/modules/local/busco_save_download.nf @@ -0,0 +1,26 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process BUSCO_SAVE_DOWNLOAD { + // execute sequentially to avoid artefacts when saving files for multiple busco instances + maxForks 1 + + // do not overwrite existing files which were already saved for other busco runs + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + overwrite: false, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } + + input: + path(busco_downloads) + + output: + path('busco_downloads/**', includeInputs: true) + + script: + """ + """ +} diff --git a/modules/local/busco_summary.nf b/modules/local/busco_summary.nf index 2fdac6ad..0873287f 100644 --- a/modules/local/busco_summary.nf +++ b/modules/local/busco_summary.nf @@ -10,21 +10,30 @@ process BUSCO_SUMMARY { mode: params.publish_dir_mode, saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), publish_id:'') } - conda (params.enable_conda ? "conda-forge::python:3.6.7" : null) + conda (params.enable_conda ? "conda-forge::pandas=1.1.5" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.6.7" + container "https://depot.galaxyproject.org/singularity/pandas:1.1.5" } else { - container "quay.io/biocontainers/python:3.6.7" + container "quay.io/biocontainers/pandas:1.1.5" } input: - path "short_summary.*.txt" + path(summaries_domain) + path(summaries_specific) + path(failed_bins) output: - path "busco_summary.txt", emit: summary + path "busco_summary.tsv", emit: summary script: + def auto = params.busco_reference ? "" : "-a" + def ss = summaries_specific.size() > 0 ? "-ss ${summaries_specific}" : "" + def sd = summaries_domain.size() > 0 ? "-sd ${summaries_domain}" : "" + def f = "" + if (!params.busco_reference && failed_bins.size() > 0) + f = "-f ${failed_bins}" """ - summary_busco.py short_summary.*.txt > busco_summary.txt + summary_busco.py $auto $ss $sd $f -o busco_summary.tsv """ } + diff --git a/nextflow.config b/nextflow.config index d56fac34..250942e1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -57,7 +57,9 @@ params { // Bin QC skip_busco = false - busco_reference = "https://busco-data.ezlab.org/v4/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + busco_reference = '' + busco_download_path = '' + busco_auto_lineage_prok = false save_busco_reference = false // Reproducibility options diff --git a/nextflow_schema.json b/nextflow_schema.json index a1df7774..cf659588 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -472,13 +472,21 @@ }, "busco_reference": { "type": "string", - "default": "https://busco-data.ezlab.org/v4/data/lineages/bacteria_odb10.2020-03-06.tar.gz", - "description": "Download path for BUSCO database.", - "help_text": "Available databases are listed here: https://busco.ezlab.org/." + "description": "Download path for BUSCO lineage dataset, instead of using automated lineage selection.", + "help_text": "E.g. https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz. Available databases are listed here: https://busco-data.ezlab.org/v5/data/lineages/." + }, + "busco_download_path": { + "type": "string", + "description": "Path to local folder containing already downloaded and unpacked lineage datasets.", + "help_text": "If provided, BUSCO analysis will be run in offline mode. Data can be downloaded from https://busco-data.ezlab.org/v5/data/ (files still need to be unpacked manually). Run in combination with automated lineage selection." + }, + "busco_auto_lineage_prok": { + "type": "boolean", + "description": "Run BUSCO with automated lineage selection, but ignoring eukaryotes (saves runtime)." }, "save_busco_reference": { "type": "boolean", - "description": "Save BUSCO reference.", + "description": "Save the used BUSCO lineage datasets provided via --busco_reference or downloaded when not using --busco_reference or --busco_download_path.", "help_text": "Useful to allow reproducibility, as BUSCO datasets are frequently updated and old versions do not always remain accessible." } } diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf index 8f053aef..eb7aa92c 100644 --- a/subworkflows/local/busco_qc.nf +++ b/subworkflows/local/busco_qc.nf @@ -2,34 +2,58 @@ * BUSCO: Quantitative measures for the assessment of genome assembly */ -params.busco_db_options = [:] -params.busco_options = [:] -params.busco_plot_options = [:] -params.busco_summary_options = [:] +params.busco_db_options = [:] +params.busco_options = [:] +params.busco_save_download_options = [:] +params.busco_plot_options = [:] +params.busco_summary_options = [:] -include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' addParams( options: params.busco_db_options ) -include { BUSCO } from '../../modules/local/busco' addParams( options: params.busco_options ) -include { BUSCO_PLOT } from '../../modules/local/busco_plot' addParams( options: params.busco_plot_options ) -include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' addParams( options: params.busco_summary_options ) +include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' addParams( options: params.busco_db_options ) +include { BUSCO } from '../../modules/local/busco' addParams( options: params.busco_options ) +include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' addParams( options: params.busco_save_download_options ) +include { BUSCO_PLOT } from '../../modules/local/busco_plot' addParams( options: params.busco_plot_options ) +include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' addParams( options: params.busco_summary_options ) workflow BUSCO_QC { take: busco_db_file // channel: path + busco_download_folder // channel: path bins // channel: [ val(meta), path(bin) ] main: - BUSCO_DB_PREPARATION ( busco_db_file ) + if (params.busco_reference){ + BUSCO_DB_PREPARATION ( busco_db_file ) + ch_busco_db = BUSCO_DB_PREPARATION.out.db + } else { + ch_busco_db = Channel.empty() + } BUSCO ( bins, - BUSCO_DB_PREPARATION.out.db + ch_busco_db.collect().ifEmpty([]), + busco_download_folder.collect().ifEmpty([]) ) - + if (params.save_busco_reference){ + // publish files downloaded by Busco + ch_downloads = BUSCO.out.busco_downloads.groupTuple().map{lin,downloads -> downloads[0]}.toSortedList().flatten() + BUSCO_SAVE_DOWNLOAD ( ch_downloads ) + } // group by assembler and sample name for plotting - BUSCO_PLOT ( BUSCO.out.summary.groupTuple(by: 0) ) - BUSCO_SUMMARY ( BUSCO.out.summary.map{it[1]}.collect() ) + // note: failed bins and bins with no domain, i.e. with viral lineages selected, will not be represented in these plots + ch_results_busco_plot = BUSCO.out.summary_specific.groupTuple(by: 0) + if (!params.busco_reference){ + ch_results_busco_plot = ch_results_busco_plot.mix(BUSCO.out.summary_domain.groupTuple(by: 0)) + } + BUSCO_PLOT ( ch_results_busco_plot ) + + BUSCO_SUMMARY ( + BUSCO.out.summary_domain.map{it[1]}.collect().ifEmpty([]), + BUSCO.out.summary_specific.map{it[1]}.collect().ifEmpty([]), + BUSCO.out.failed_bin.map{it[1]}.collect().ifEmpty([]) + ) emit: - summary = BUSCO_SUMMARY.out.summary - multiqc = BUSCO.out.summary.map{it[1]} - version = BUSCO.out.version + summary = BUSCO_SUMMARY.out.summary + failed_bin = BUSCO.out.failed_bin.map{it[1]} + multiqc = BUSCO.out.summary_domain.map{it[1]} + version = BUSCO.out.version }