Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CAT_SUMMARY process and official_taxonomy param #366

Merged
merged 8 commits into from
Dec 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#353](https://github.com/nf-core/mag/pull/353) - Added the busco_clean parameter to optionally clean each BUSCO directory after a successful run
- [#361](https://github.com/nf-core/mag/pull/361) - Added the skip_clipping parameter to skip read preprocessing with fastp or adapterremoval. Running the pipeline with skip_clipping, keep_phix and without specifying a host genome or fasta file skips the FASTQC_TRIMMED process.
- [#365](https://github.com/nf-core/mag/pull/365) - Adds CONCOCT as an additional (optional) binning tool
- [#366](https://github.com/nf-core/mag/pull/366) - Added CAT_SUMMARY process and cat_official_taxonomy parameter.

### `Changed`

Expand Down
8 changes: 8 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,14 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
// Settings applied to the CAT_SUMMARY process: output prefix and publishing rules.
withName: CAT_SUMMARY {
// Base name of the combined table; the process writes "${prefix}.tsv"
// (it falls back to the same "cat_summary" value when ext.prefix is unset).
ext.prefix = "cat_summary"
publishDir = [
// Combined CAT table is published under <outdir>/Taxonomy/CAT/
path: { "${params.outdir}/Taxonomy/CAT/" },
mode: params.publish_dir_mode,
// Publish every output except versions.yml (saveAs returns null for that file)
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: GTDBTK_CLASSIFY {
ext.args = "--extension fa"
Expand Down
3 changes: 3 additions & 0 deletions conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,7 @@ params {
megahit_fix_cpu_1 = true
// available options to enable reproducibility for BUSCO (--busco_download_path or --busco_reference) not used here
// to allow detection of possible problems in automated lineage selection mode using public databases

// test CAT with official taxonomic ranks only
cat_official_taxonomy = true
}
5 changes: 3 additions & 2 deletions modules/local/cat.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ process CAT {
path "versions.yml" , emit: versions

script:
def official_taxonomy = params.cat_official_taxonomy ? "--official_taxonomy" : ""
"""
CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${meta.assembler}-${meta.binner}-${meta.id}" --I_know_what_Im_doing
CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/
CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/
CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}

mkdir raw
mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/
Expand Down
29 changes: 29 additions & 0 deletions modules/local/cat_summary.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// Merge per-bin CAT classification tables into one tab-separated summary.
//
// Input:
//   cat_summaries - files staged into the task directory. Gzipped files whose
//                   names match "*bin2classification.names.txt.gz" are
//                   decompressed next to the original (".gz" suffix removed).
// Output:
//   combined - "${prefix}.tsv": the header line of the first "*.txt" file
//              followed by the non-header lines of every "*.txt" file.
//   versions - versions.yml recording the bioawk version used.
process CAT_SUMMARY {
    label 'process_low'

    conda (params.enable_conda ? "bioconda::bioawk=1.0" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' :
        'quay.io/biocontainers/bioawk:1.0--hed695b0_5' }"

    input:
    path(cat_summaries)

    output:
    path("*.tsv")      , emit: combined
    path "versions.yml", emit: versions

    script:
    // Output base name; overridable via ext.prefix in the modules config.
    def prefix = task.ext.prefix ?: "cat_summary"
    """
    # use find as sometimes these are empty and need to fail gracefully
    # -L follows the symlinks used to stage inputs; both expansions are quoted
    # so that file names containing whitespace or glob characters stay intact
    find -L . -type f -name "*bin2classification.names.txt.gz" -exec sh -c 'for f do gunzip -c "\$f" > "\${f%.*}"; done' find-sh {} +

    # keep the header of the first file only (NR == 1), then every data row (FNR > 1)
    bioawk '(NR == 1) || (FNR > 1)' *.txt > ${prefix}.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bioawk: \$(bioawk --version | cut -f 3 -d ' ' )
    END_VERSIONS
    """
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ params {
skip_krona = false
cat_db = null
cat_db_generate = false
cat_official_taxonomy = false
save_cat_db = false
gtdb = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz"
gtdbtk_min_completeness = 50.0
Expand Down
6 changes: 5 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,10 @@
"description": "Save the CAT database generated when specified by `--cat_db_generate`.",
"help_text": "Useful to allow reproducibility, as old versions of prebuild CAT databases do not always remain accessible and underlying NCBI taxonomy and nr databases change."
},
"cat_official_taxonomy": {
"type": "boolean",
"description": "Only return official taxonomic ranks (Kingdom, Phylum, etc.) when running CAT."
},
"gtdb": {
"type": "string",
"default": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz",
Expand Down Expand Up @@ -641,7 +645,7 @@
"type": "number",
"default": 0.5,
"description": "Specify single-copy gene score threshold for bin refinement.",
"help_text": "Score threshold for single-copy gene selection algorithm to keep selecting bins, with a value ranging from 0-1.\n\nFor description of scoring algorithm, see: Sieber, Christian M. K., et al. 2018. Nature Microbiology 3 (7): 836\u201343. https://doi.org/10.1038/s41564-018-0171-1.\n\n> Modifies DAS Tool parameter --score_threshold\n"
"help_text": "Score threshold for single-copy gene selection algorithm to keep selecting bins, with a value ranging from 0-1.\n\nFor description of scoring algorithm, see: Sieber, Christian M. K., et al. 2018. Nature Microbiology 3 (7): 836–43. https://doi.org/10.1038/s41564-018-0171-1.\n\n> Modifies DAS Tool parameter --score_threshold\n"
},
"postbinning_input": {
"type": "string",
Expand Down
5 changes: 5 additions & 0 deletions workflows/mag.nf
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ include { QUAST_BINS_SUMMARY } from '../modules
include { CAT_DB } from '../modules/local/cat_db'
include { CAT_DB_GENERATE } from '../modules/local/cat_db_generate'
include { CAT } from '../modules/local/cat'
include { CAT_SUMMARY } from "../modules/local/cat_summary"
include { BIN_SUMMARY } from '../modules/local/bin_summary'
include { COMBINE_TSV } from '../modules/local/combine_tsv'
include { MULTIQC } from '../modules/local/multiqc'
Expand Down Expand Up @@ -640,7 +641,11 @@ workflow MAG {
ch_input_for_postbinning_bins,
ch_cat_db
)
CAT_SUMMARY(
CAT.out.tax_classification.collect()
)
ch_versions = ch_versions.mix(CAT.out.versions.first())
ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions.first())

/*
* GTDB-tk: taxonomic classifications using GTDB reference
Expand Down