Skip to content

Commit

Permalink
Merge pull request #366 from prototaxites/cat_summarise
Browse files Browse the repository at this point in the history
Add CAT_SUMMARY process and official_taxonomy param
  • Loading branch information
d4straub authored Dec 16, 2022
2 parents c3a0ef3 + 7b0adf6 commit 9ad1729
Show file tree
Hide file tree
Showing 8 changed files with 55 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#353](https://github.com/nf-core/mag/pull/353) - Added the busco_clean parameter to optionally clean each BUSCO directory after a successful
- [#361](https://github.com/nf-core/mag/pull/361) - Added the skip_clipping parameter to skip read preprocessing with fastp or adapterremoval. Running the pipeline with skip_clipping, keep_phix and without specifying a host genome or fasta file skips the FASTQC_TRIMMED process.
- [#365](https://github.com/nf-core/mag/pull/365) - Adds CONCOCT as an additional (optional) binning tool
- [#366](https://github.com/nf-core/mag/pull/366) - Added CAT_SUMMARY process and cat_official_taxonomy parameter.

### `Changed`

Expand Down
8 changes: 8 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,14 @@ process {
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
withName: CAT_SUMMARY {
    // Base name handed to the process via task.ext.prefix
    ext.prefix = "cat_summary"

    // Publish results under <outdir>/Taxonomy/CAT/, but never versions.yml:
    // saveAs returns null for that file, which suppresses publishing it.
    publishDir = [
        path: { "${params.outdir}/Taxonomy/CAT/" },
        mode: params.publish_dir_mode,
        saveAs: { fname -> fname == 'versions.yml' ? null : fname }
    ]
}

withName: GTDBTK_CLASSIFY {
ext.args = "--extension fa"
Expand Down
3 changes: 3 additions & 0 deletions conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,7 @@ params {
megahit_fix_cpu_1 = true
// available options to enable reproducibility for BUSCO (--busco_download_path or --busco_reference) not used here
// to allow detection of possible problems in automated lineage selection mode using public databases

// test CAT with official taxonomic ranks only
cat_official_taxonomy = true
}
5 changes: 3 additions & 2 deletions modules/local/cat.nf
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ process CAT {
path "versions.yml" , emit: versions

script:
def official_taxonomy = params.cat_official_taxonomy ? "--official_taxonomy" : ""
"""
CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${meta.assembler}-${meta.binner}-${meta.id}" --I_know_what_Im_doing
CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/
CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/
CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy}
CAT add_names -i "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.binner}-${meta.id}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy}
mkdir raw
mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/
Expand Down
29 changes: 29 additions & 0 deletions modules/local/cat_summary.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/*
 * CAT_SUMMARY: merge the gzipped CAT "bin2classification.names.txt" tables
 * staged in the task directory into a single tab-separated file.
 *
 * Input:
 *   cat_summaries - the files to merge (the workflow passes the collected
 *                   CAT.out.tax_classification channel).
 * Output:
 *   combined - "<prefix>.tsv", where prefix is task.ext.prefix or "cat_summary".
 *   versions - versions.yml recording the bioawk version.
 */
process CAT_SUMMARY {
    label 'process_low'

    conda (params.enable_conda ? "bioconda::bioawk=1.0" : null)
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' :
        'quay.io/biocontainers/bioawk:1.0--hed695b0_5' }"

    input:
    path(cat_summaries)

    output:
    path("*.tsv")      , emit: combined
    path "versions.yml", emit: versions

    script:
    def prefix = task.ext.prefix ?: "cat_summary"
    """
    # use find as sometimes these are empty and need to fail gracefully
    # -L follows symlinks; the explicit "." start path keeps this portable to
    # find implementations that require one. Each match is decompressed next to
    # itself with the trailing ".gz" removed; the variables are quoted so that
    # unusual file names are not word-split by the shell.
    find -L . -type f -name "*bin2classification.names.txt.gz" -exec sh -c 'for f do gunzip -c "\$f" > "\${f%.*}"; done' find-sh {} +

    # Keep the first line of the first file only (the header), then every
    # non-first line of each file.
    bioawk '(NR == 1) || (FNR > 1)' *.txt > ${prefix}.tsv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bioawk: \$(bioawk --version | cut -f 3 -d ' ' )
    END_VERSIONS
    """
}
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ params {
skip_krona = false
cat_db = null
cat_db_generate = false
cat_official_taxonomy = false
save_cat_db = false
gtdb = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz"
gtdbtk_min_completeness = 50.0
Expand Down
6 changes: 5 additions & 1 deletion nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,10 @@
"description": "Save the CAT database generated when specified by `--cat_db_generate`.",
"help_text": "Useful to allow reproducibility, as old versions of prebuilt CAT databases do not always remain accessible and underlying NCBI taxonomy and nr databases change."
},
"cat_official_taxonomy": {
"type": "boolean",
"description": "Only return official taxonomic ranks (Kingdom, Phylum, etc.) when running CAT."
},
"gtdb": {
"type": "string",
"default": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz",
Expand Down Expand Up @@ -641,7 +645,7 @@
"type": "number",
"default": 0.5,
"description": "Specify single-copy gene score threshold for bin refinement.",
"help_text": "Score threshold for single-copy gene selection algorithm to keep selecting bins, with a value ranging from 0-1.\n\nFor description of scoring algorithm, see: Sieber, Christian M. K., et al. 2018. Nature Microbiology 3 (7): 836\u201343. https://doi.org/10.1038/s41564-018-0171-1.\n\n> Modifies DAS Tool parameter --score_threshold\n"
"help_text": "Score threshold for single-copy gene selection algorithm to keep selecting bins, with a value ranging from 0-1.\n\nFor description of scoring algorithm, see: Sieber, Christian M. K., et al. 2018. Nature Microbiology 3 (7): 836–43. https://doi.org/10.1038/s41564-018-0171-1.\n\n> Modifies DAS Tool parameter --score_threshold\n"
},
"postbinning_input": {
"type": "string",
Expand Down
5 changes: 5 additions & 0 deletions workflows/mag.nf
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ include { QUAST_BINS_SUMMARY } from '../modules
include { CAT_DB } from '../modules/local/cat_db'
include { CAT_DB_GENERATE } from '../modules/local/cat_db_generate'
include { CAT } from '../modules/local/cat'
include { CAT_SUMMARY } from "../modules/local/cat_summary"
include { BIN_SUMMARY } from '../modules/local/bin_summary'
include { COMBINE_TSV } from '../modules/local/combine_tsv'
include { MULTIQC } from '../modules/local/multiqc'
Expand Down Expand Up @@ -640,7 +641,11 @@ workflow MAG {
ch_input_for_postbinning_bins,
ch_cat_db
)
CAT_SUMMARY(
CAT.out.tax_classification.collect()
)
ch_versions = ch_versions.mix(CAT.out.versions.first())
ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions.first())

/*
* GTDB-tk: taxonomic classifications using GTDB reference
Expand Down

0 comments on commit 9ad1729

Please sign in to comment.