diff --git a/CHANGELOG.md b/CHANGELOG.md index 04c6bc97..6dcc5d13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#353](https://github.com/nf-core/mag/pull/353) - Added the busco_clean parameter to optionally clean each BUSCO directory after a successful - [#361](https://github.com/nf-core/mag/pull/361) - Added the skip_clipping parameter to skip read preprocessing with fastp or adapterremoval. Running the pipeline with skip_clipping, keep_phix and without specifying a host genome or fasta file skips the FASTQC_TRIMMED process. - [#365](https://github.com/nf-core/mag/pull/365) - Adds CONCOCT as an additional (optional) binning tool +- [#366](https://github.com/nf-core/mag/pull/366) - Added CAT_SUMMARY process and cat_official_taxonomy parameter. ### `Changed` diff --git a/conf/modules.config b/conf/modules.config index 8ed7998c..1d753fc3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -284,6 +284,14 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: CAT_SUMMARY { + ext.prefix = "cat_summary" + publishDir = [ + path: { "${params.outdir}/Taxonomy/CAT/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } withName: GTDBTK_CLASSIFY { ext.args = "--extension fa" diff --git a/conf/test_full.config b/conf/test_full.config index 34e81f1a..ba1a37ec 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -30,4 +30,7 @@ params { megahit_fix_cpu_1 = true // available options to enable reproducibility for BUSCO (--busco_download_path or --busco_reference) not used here // to allow detection of possible problems in automated lineage selection mode using public databases + + // test CAT with official taxonomic ranks only + cat_official_taxonomy = true } diff --git a/modules/local/cat.nf b/modules/local/cat.nf index a4943c28..0bf2c167 100644 --- a/modules/local/cat.nf +++ b/modules/local/cat.nf @@ -20,10 +20,11 @@ process CAT { path "versions.yml" , emit: versions script: + def official_taxonomy = params.cat_official_taxonomy ? "--only_official" : "" /* CAT add_names spells the official-ranks-only switch --only_official */ """ CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${meta.assembler}-${meta.binner}-${meta.id}" --I_know_what_Im_doing - CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/ - CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/ + CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy} + CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy} mkdir raw mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/ diff --git a/modules/local/cat_summary.nf b/modules/local/cat_summary.nf new file mode 100644 index 00000000..25958bbc --- 
/dev/null +++ b/modules/local/cat_summary.nf @@ -0,0 +1,29 @@ +process CAT_SUMMARY { + label 'process_low' + + conda (params.enable_conda ? "bioconda::bioawk=1.0" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' : + 'quay.io/biocontainers/bioawk:1.0--hed695b0_5' }" + + input: + path(cat_summaries) + + output: + path("*.tsv") , emit: combined + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "cat_summary" + """ + # use find as sometimes these are empty and need to fail gracefully (paths are quoted so odd file names are not word-split) + find -L -type f -name "*bin2classification.names.txt.gz" -exec sh -c 'for f do gunzip -c "\$f" > "\${f%.*}"; done' find-sh {} + + + bioawk '(NR == 1) || (FNR > 1)' *.txt > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioawk: \$(bioawk --version | cut -f 3 -d ' ' ) + END_VERSIONS + """ +} diff --git a/nextflow.config b/nextflow.config index 9aa4d476..37c146ba 100644 --- a/nextflow.config +++ b/nextflow.config @@ -67,6 +67,7 @@ params { skip_krona = false cat_db = null cat_db_generate = false + cat_official_taxonomy = false save_cat_db = false gtdb = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz" gtdbtk_min_completeness = 50.0 diff --git a/nextflow_schema.json b/nextflow_schema.json index 110cd078..a9c044c5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -447,6 +447,10 @@ "description": "Save the CAT database generated when specified by `--cat_db_generate`.", "help_text": "Useful to allow reproducibility, as old versions of prebuild CAT databases do not always remain accessible and underlying NCBI taxonomy and nr databases change." }, + "cat_official_taxonomy": { + "type": "boolean", + "description": "Only return official taxonomic ranks (Kingdom, Phylum, etc.) when running CAT." 
+ }, "gtdb": { "type": "string", "default": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz", @@ -641,7 +645,7 @@ "type": "number", "default": 0.5, "description": "Specify single-copy gene score threshold for bin refinement.", - "help_text": "Score threshold for single-copy gene selection algorithm to keep selecting bins, with a value ranging from 0-1.\n\nFor description of scoring algorithm, see: Sieber, Christian M. K., et al. 2018. Nature Microbiology 3 (7): 836\u201343. https://doi.org/10.1038/s41564-018-0171-1.\n\n> Modifies DAS Tool parameter --score_threshold\n" + "help_text": "Score threshold for single-copy gene selection algorithm to keep selecting bins, with a value ranging from 0-1.\n\nFor description of scoring algorithm, see: Sieber, Christian M. K., et al. 2018. Nature Microbiology 3 (7): 836–43. https://doi.org/10.1038/s41564-018-0171-1.\n\n> Modifies DAS Tool parameter --score_threshold\n" }, "postbinning_input": { "type": "string", diff --git a/workflows/mag.nf b/workflows/mag.nf index 4568e77a..fa68c401 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -83,6 +83,7 @@ include { QUAST_BINS_SUMMARY } from '../modules include { CAT_DB } from '../modules/local/cat_db' include { CAT_DB_GENERATE } from '../modules/local/cat_db_generate' include { CAT } from '../modules/local/cat' +include { CAT_SUMMARY } from '../modules/local/cat_summary' include { BIN_SUMMARY } from '../modules/local/bin_summary' include { COMBINE_TSV } from '../modules/local/combine_tsv' include { MULTIQC } from '../modules/local/multiqc' @@ -640,7 +641,11 @@ workflow MAG { ch_input_for_postbinning_bins, ch_cat_db ) + CAT_SUMMARY( + CAT.out.tax_classification.collect() + ) ch_versions = ch_versions.mix(CAT.out.versions.first()) + ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions.first()) /* * GTDB-tk: taxonomic classifications using GTDB reference