From 5b7b09babd8f2ca18a3cce5fea6d043102c2358d Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 27 Feb 2024 22:39:19 +0100 Subject: [PATCH 01/39] add mmseqs modules --- conf/modules.config | 13 ++++ modules.json | 20 ++++++ .../nf-core/mmseqs/createdb/environment.yml | 7 ++ modules/nf-core/mmseqs/createdb/main.nf | 65 +++++++++++++++++++ modules/nf-core/mmseqs/createdb/meta.yml | 47 ++++++++++++++ .../nf-core/mmseqs/createtsv/environment.yml | 7 ++ modules/nf-core/mmseqs/createtsv/main.nf | 63 ++++++++++++++++++ modules/nf-core/mmseqs/createtsv/meta.yml | 65 +++++++++++++++++++ .../nf-core/mmseqs/databases/environment.yml | 7 ++ modules/nf-core/mmseqs/databases/main.nf | 62 ++++++++++++++++++ modules/nf-core/mmseqs/databases/meta.yml | 33 ++++++++++ .../nf-core/mmseqs/taxonomy/environment.yml | 9 +++ modules/nf-core/mmseqs/taxonomy/main.nf | 65 +++++++++++++++++++ modules/nf-core/mmseqs/taxonomy/meta.yml | 48 ++++++++++++++ nextflow.config | 7 ++ workflows/funcscan.nf | 44 +++++++++++++ 16 files changed, 562 insertions(+) create mode 100644 modules/nf-core/mmseqs/createdb/environment.yml create mode 100644 modules/nf-core/mmseqs/createdb/main.nf create mode 100644 modules/nf-core/mmseqs/createdb/meta.yml create mode 100644 modules/nf-core/mmseqs/createtsv/environment.yml create mode 100644 modules/nf-core/mmseqs/createtsv/main.nf create mode 100644 modules/nf-core/mmseqs/createtsv/meta.yml create mode 100644 modules/nf-core/mmseqs/databases/environment.yml create mode 100644 modules/nf-core/mmseqs/databases/main.nf create mode 100644 modules/nf-core/mmseqs/databases/meta.yml create mode 100644 modules/nf-core/mmseqs/taxonomy/environment.yml create mode 100644 modules/nf-core/mmseqs/taxonomy/main.nf create mode 100644 modules/nf-core/mmseqs/taxonomy/meta.yml diff --git a/conf/modules.config b/conf/modules.config index aad4d949..669f8402 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -52,6 +52,19 @@ process { ext.prefix = { "${meta.id}.fa" } } + 
withName: MMSEQS_DATABASES { + publishDir = [ + path: { "${params.outdir}/databases/mmseqs_taxonomy" }, + mode: params.publish_dir_mode, + enabled: params.save_databases, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = [ + params.classify_taxonomy_mmseqs_db_savetmp ? "" : "--remove-tmp-files" , + ].join(' ').trim() + } + + withName: PROKKA { publishDir = [ path: { "${params.outdir}/annotation/prokka/" }, diff --git a/modules.json b/modules.json index 71a31800..fc89ae2c 100644 --- a/modules.json +++ b/modules.json @@ -141,6 +141,26 @@ "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, + "mmseqs/createdb": { + "branch": "master", + "git_sha": "18a43d316b6fd683dc2346867b42882b99811cfd", + "installed_by": ["modules"] + }, + "mmseqs/createtsv": { + "branch": "master", + "git_sha": "151460db852d636979d9ff3ee631e2268060d4c3", + "installed_by": ["modules"] + }, + "mmseqs/databases": { + "branch": "master", + "git_sha": "151460db852d636979d9ff3ee631e2268060d4c3", + "installed_by": ["modules"] + }, + "mmseqs/taxonomy": { + "branch": "master", + "git_sha": "8455be677998258bf40ab3be550c6a96f456cc23", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", diff --git a/modules/nf-core/mmseqs/createdb/environment.yml b/modules/nf-core/mmseqs/createdb/environment.yml new file mode 100644 index 00000000..77b28f59 --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/environment.yml @@ -0,0 +1,7 @@ +name: mmseqs_createdb +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::mmseqs2=15.6f452 diff --git a/modules/nf-core/mmseqs/createdb/main.nf b/modules/nf-core/mmseqs/createdb/main.nf new file mode 100644 index 00000000..9487e5bc --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/main.nf @@ -0,0 +1,65 @@ +process MMSEQS_CREATEDB { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + 
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mmseqs2:15.6f452--pl5321h6a68c12_0': + 'biocontainers/mmseqs2:15.6f452--pl5321h6a68c12_0' }" + + input: + tuple val(meta), path(sequence) + + output: + tuple val(meta), path("${prefix}/"), emit: db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = sequence.getExtension() == "gz" ? true : false + def sequence_name = is_compressed ? sequence.getBaseName() : sequence + """ + if [ "${is_compressed}" == "true" ]; then + gzip -c -d ${sequence} > ${sequence_name} + fi + + mkdir -p ${prefix} + + mmseqs \\ + createdb \\ + ${sequence_name} \\ + ${prefix}/${prefix} \\ + $args \\ + --compressed 1 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p ${prefix} + + touch ${prefix}/${prefix} + touch ${prefix}/${prefix}.dbtype + touch ${prefix}/${prefix}.index + touch ${prefix}/${prefix}.lookup + touch ${prefix}/${prefix}.source + touch ${prefix}/${prefix}_h + touch ${prefix}/${prefix}_h.dbtype + touch ${prefix}/${prefix}_h.index + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mmseqs/createdb/meta.yml b/modules/nf-core/mmseqs/createdb/meta.yml new file mode 100644 index 00000000..a011020b --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/meta.yml @@ -0,0 +1,47 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "mmseqs_createdb" +description: Create an MMseqs database from an 
existing FASTA/Q file +keywords: + - protein sequence + - databases + - clustering + - searching + - indexing + - mmseqs2 +tools: + - "mmseqs": + description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite" + homepage: "https://github.com/soedinglab/MMseqs2" + documentation: "https://mmseqs.com/latest/userguide.pdf" + tool_dev_url: "https://github.com/soedinglab/MMseqs2" + doi: "10.1093/bioinformatics/btw006" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - sequence: + type: file + description: Input sequences in FASTA/Q (zipped or unzipped) format to parse into an mmseqs database + pattern: "*.{fasta,fasta.gz,fa,fa.gz,fna,fna.gz,fastq,fastq.gz,fq,fq.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - db: + type: directory + description: The created MMseqs2 database + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Joon-Klaps" +maintainers: + - "@Joon-Klaps" + - "@vagkaratzas" diff --git a/modules/nf-core/mmseqs/createtsv/environment.yml b/modules/nf-core/mmseqs/createtsv/environment.yml new file mode 100644 index 00000000..4840fc02 --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/environment.yml @@ -0,0 +1,7 @@ +name: mmseqs_createtsv +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::mmseqs2=15.6f452 diff --git a/modules/nf-core/mmseqs/createtsv/main.nf b/modules/nf-core/mmseqs/createtsv/main.nf new file mode 100644 index 00000000..ee58b10f --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/main.nf @@ -0,0 +1,63 @@ + +process MMSEQS_CREATETSV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mmseqs2:15.6f452--pl5321h6a68c12_0': + 'biocontainers/mmseqs2:15.6f452--pl5321h6a68c12_0' }" + + input: + tuple val(meta), path(db_result) + tuple val(meta2), path(db_query) + tuple val(meta3), path(db_target) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args ?: "*.dbtype" + def args3 = task.ext.args ?: "*.dbtype" + def args4 = task.ext.args ?: "*.dbtype" + def prefix = task.ext.prefix ?: "${meta.id}" + db_target = db_target ?: "${db_query}" // optional argument db_target as in many cases, it's the same as db_query + """ + # Extract files with specified args based suffix | remove suffix | isolate longest common substring of files + DB_RESULT_PATH_NAME=\$(find -L "$db_result/" -maxdepth 1 -name "$args2" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) + DB_QUERY_PATH_NAME=\$(find -L "$db_query/" -maxdepth 1 -name "$args3" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) + DB_TARGET_PATH_NAME=\$(find -L "$db_target/" -maxdepth 1 -name "$args4" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) + + mmseqs \\ + createtsv \\ + \$DB_QUERY_PATH_NAME \\ + \$DB_TARGET_PATH_NAME \\ + \$DB_RESULT_PATH_NAME \\ + ${prefix}.tsv \\ + $args \\ + --threads ${task.cpus} \\ + --compressed 1 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mmseqs/createtsv/meta.yml 
b/modules/nf-core/mmseqs/createtsv/meta.yml new file mode 100644 index 00000000..e85b066f --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/meta.yml @@ -0,0 +1,65 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "mmseqs_createtsv" +description: Create a tsv file from a query and a target database as well as the result database +keywords: + - protein sequence + - databases + - clustering + - searching + - indexing + - mmseqs2 + - tsv +tools: + - "mmseqs": + description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite" + homepage: "https://github.com/soedinglab/MMseqs2" + documentation: "https://mmseqs.com/latest/userguide.pdf" + tool_dev_url: "https://github.com/soedinglab/MMseqs2" + doi: "10.1093/bioinformatics/btw006" + licence: ["GPL v3"] +input: + # Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - db_result: + type: directory + description: an MMseqs2 database with result data + - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - db_query: + type: directory + description: an MMseqs2 database with query data + - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - db_target: + type: directory + description: an MMseqs2 database with target data +output: + #Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + - tsv: + type: file + description: The resulting tsv file created using the query, target and result MMseqs databases + pattern: "*.{tsv}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Joon-Klaps" +maintainers: + - "@Joon-Klaps" diff --git a/modules/nf-core/mmseqs/databases/environment.yml b/modules/nf-core/mmseqs/databases/environment.yml new file mode 100644 index 00000000..3bf8437d --- /dev/null +++ b/modules/nf-core/mmseqs/databases/environment.yml @@ -0,0 +1,7 @@ +name: mmseqs_databases +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::mmseqs2=15.6f452 diff --git a/modules/nf-core/mmseqs/databases/main.nf b/modules/nf-core/mmseqs/databases/main.nf new file mode 100644 index 00000000..3e228b29 --- /dev/null +++ b/modules/nf-core/mmseqs/databases/main.nf @@ -0,0 +1,62 @@ +process MMSEQS_DATABASES { + tag "${database}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mmseqs2:15.6f452--pl5321h6a68c12_0': + 'biocontainers/mmseqs2:15.6f452--pl5321h6a68c12_0' }" + + input: + val database + + output: + path "${prefix}/" , emit: database + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: 'mmseqs_database' + """ + mkdir ${prefix}/ + + mmseqs databases \\ + ${database} \\ + ${prefix}/database \\ + tmp/ \\ + --threads ${task.cpus} \\ + --compressed 1 \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: 'mmseqs_database' + """ + mkdir ${prefix}/ + + touch ${prefix}/database + touch ${prefix}/database.dbtype + touch ${prefix}/database_h + touch ${prefix}/database_h.dbtype + touch ${prefix}/database_h.index + touch ${prefix}/database.index + touch ${prefix}/database.lookup + touch ${prefix}/database_mapping + touch ${prefix}/database.source + touch ${prefix}/database_taxonomy + touch ${prefix}/database.version + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: /') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mmseqs/databases/meta.yml b/modules/nf-core/mmseqs/databases/meta.yml new file mode 100644 index 00000000..803a87f6 --- /dev/null +++ b/modules/nf-core/mmseqs/databases/meta.yml @@ -0,0 +1,33 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "mmseqs_databases" +description: Download an mmseqs-formatted database +keywords: + - database + - indexing + - clustering + - searching +tools: + - "mmseqs": + description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite" + homepage: "https://github.com/soedinglab/MMseqs2" + documentation: 
"https://mmseqs.com/latest/userguide.pdf" + tool_dev_url: "https://github.com/soedinglab/MMseqs2" + doi: "10.1093/bioinformatics/btw006" + licence: ["GPL v3"] +input: + - database: + type: string + description: Database available through the mmseqs2 databases interface - see https://github.com/soedinglab/MMseqs2/wiki#downloading-databases for details +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - database: + type: directory + description: Directory containing processed mmseqs database +authors: + - "@prototaxites" +maintainers: + - "@prototaxites" diff --git a/modules/nf-core/mmseqs/taxonomy/environment.yml b/modules/nf-core/mmseqs/taxonomy/environment.yml new file mode 100644 index 00000000..fa40c277 --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "mmseqs_taxonomy" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::mmseqs2=15.6f452" diff --git a/modules/nf-core/mmseqs/taxonomy/main.nf b/modules/nf-core/mmseqs/taxonomy/main.nf new file mode 100644 index 00000000..54849885 --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/main.nf @@ -0,0 +1,65 @@ +process MMSEQS_TAXONOMY { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mmseqs2:15.6f452--pl5321h6a68c12_0': + 'biocontainers/mmseqs2:15.6f452--pl5321h6a68c12_0' }" + + input: + tuple val(meta), path(db_query) + path(db_target) + + output: + tuple val(meta), path("${prefix}_taxonomy"), emit: db_taxonomy + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: "*.dbtype" //represents the db_query + def args3 = task.ext.args3 ?: "*.dbtype" //represents the db_target + prefix = task.ext.prefix ?: "${meta.id}" + + """ + mkdir -p ${prefix}_taxonomy + + # Extract files with specified args based suffix | remove suffix | isolate longest common substring of files + DB_QUERY_PATH_NAME=\$(find -L "${db_query}/" -maxdepth 1 -name "${args2}" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) + DB_TARGET_PATH_NAME=\$(find -L "${db_target}/" -maxdepth 1 -name "${args3}" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) + + mmseqs \\ + taxonomy \\ + \$DB_QUERY_PATH_NAME \\ + \$DB_TARGET_PATH_NAME \\ + ${prefix}_taxonomy/${prefix} \\ + tmp1 \\ + $args \\ + --threads ${task.cpus} \\ + --compressed 1 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + + """ + mkdir -p ${prefix}_taxonomy + touch ${prefix}_taxonomy/${prefix}.{0..25} + touch ${prefix}_taxonomy/${prefix}.dbtype + touch ${prefix}_taxonomy/${prefix}.index + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mmseqs/taxonomy/meta.yml b/modules/nf-core/mmseqs/taxonomy/meta.yml new file mode 100644 index 00000000..d836029c --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/meta.yml @@ 
-0,0 +1,48 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "mmseqs_taxonomy" +description: Computes the lowest common ancestor by identifying the query sequence homologs against the target database. +keywords: + - protein sequence + - nucleotide sequence + - databases + - taxonomy + - homologs + - mmseqs2 +tools: + - "mmseqs": + description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite" + homepage: "https://github.com/soedinglab/MMseqs2" + documentation: "https://mmseqs.com/latest/userguide.pdf" + tool_dev_url: "https://github.com/soedinglab/MMseqs2" + doi: "10.1093/bioinformatics/btw006" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - db_query: + type: directory + description: An MMseqs2 database with query data + - db_target: + type: directory + description: an MMseqs2 database with target data including the taxonomy classification +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + - db_taxonomy: + type: directory + description: An MMseqs2 database with target data including the taxonomy classification + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@darcy220606" +maintainers: + - "@darcy220606" diff --git a/nextflow.config b/nextflow.config index b15a01df..5387a34f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,6 +17,13 @@ params { igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false + // Taxonomy classification options + classify_taxonomy = true + + classify_taxonomy_mmseqs_db_localpath = null + classify_taxonomy_mmseqs_db = 'SILVA' + classify_taxonomy_mmseqs_db_savetmp = false + // Annotation options annotation_tool = 'pyrodigal' diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 64bedecc..92c60190 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -91,6 +91,10 @@ include { PRODIGAL as PRODIGAL_GBK } from '../modules/nf-core/prodigal/ include { PYRODIGAL } from '../modules/nf-core/pyrodigal/main' include { BAKTA_BAKTADBDOWNLOAD } from '../modules/nf-core/bakta/baktadbdownload/main' include { BAKTA_BAKTA } from '../modules/nf-core/bakta/bakta/main' +include { MMSEQS_CREATEDB } from '../modules/nf-core/mmseqs/createdb/main' +include { MMSEQS_DATABASES } from '../modules/nf-core/mmseqs/databases/main' +include { MMSEQS_TAXONOMY } from '../modules/nf-core/mmseqs/taxonomy/main' +include { MMSEQS_CREATETSV } from '../modules/nf-core/mmseqs/createtsv/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -136,6 +140,46 @@ workflow FUNCSCAN { [ meta, fasta ] } + /* + TAXONOMIC CLASSIFICATION + */ + // The final subworkflow reports need taxonomic classification + // This can be either on NT or AA level depending on annotation + // NOTE: (AA tax. 
classification will be added only when its PR is merged - NOW - only on NT) + //TODO RUN MMSEQS/database /create db and taxonomy and converttsv / and grab teh output table + if ( params.classify_taxonomy == true ) { + + // Download the ref db if not supplied by user + if ( params.classify_taxonomy_mmseqs_db_localpath ) { + ch_mmseqs_db = Channel + .fromPath( params.classify_taxonomy_mmseqs_db_localpath ) + .first() + } else { + MMSEQS_DATABASES ( params.classify_taxonomy_mmseqs_db ) + ch_versions = ch_versions.mix( MMSEQS_DATABASES.out.versions ) + ch_mmseqs_db = ( MMSEQS_DATABASES.out.database ) + } + + // Create db for query contigs, assign taxonomy and convert to table format + MMSEQS_CREATEDB ( ch_prepped_input ) + ch_versions = ch_versions.mix(MMSEQS_CREATEDB.out.versions) + ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db + MMSEQS_TAXONOMY ( ch_taxonomy_querydb, ch_mmseqs_db ) + ch_versions = ch_versions.mix(MMSEQS_TAXONOMY.out.versions) + ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db + MMSEQS_CREATETSV ( ch_taxonomy_querydb, ch_taxonomy_querydb_taxdb, [[:],[]] ) + ch_versions = ch_versions.mix(MMSEQS_CREATETSV.out.versions) + ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv + + } else { + + ch_mmseqs_db = Channel.empty() + ch_taxonomy_querydb = Channel.empty() + ch_taxonomy_querydb_taxdb = Channel.empty() + ch_taxonomy_tsv = Channel.empty() + + } + /* ANNOTATION */ From 6ce2f6712df52362bb29a0be07c5cb5a3eb38fe5 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Wed, 28 Feb 2024 13:26:32 +0100 Subject: [PATCH 02/39] Add all parametrs necessary in config files --- conf/modules.config | 40 ++++++++++++++++++- nextflow.config | 20 +++++++--- nextflow_schema.json | 90 +++++++++++++++++++++++++++++++++++++------ workflows/funcscan.nf | 2 +- 4 files changed, 133 insertions(+), 19 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 669f8402..0faf5f7c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -54,16 +54,52 @@ process { withName: 
MMSEQS_DATABASES { publishDir = [ - path: { "${params.outdir}/databases/mmseqs_taxonomy" }, + path: { "${params.outdir}/databases/" }, mode: params.publish_dir_mode, enabled: params.save_databases, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = [ - params.classify_taxonomy_mmseqs_db_savetmp ? "" : "--remove-tmp-files" , + params.taxonomy_mmseqs_databases_savetmp ? "" : "--remove-tmp-files" , ].join(' ').trim() } + withName: MMSEQS_CREATEDB { + publishDir = [ + path: { "${params.outdir}/taxonomy/mmseqs_createdb/" }, + mode: params.publish_dir_mode, + enabled: params.taxonomy_mmseqs_save_intermedfiles, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: MMSEQS_TAXONOMY { + publishDir = [ + path: { "${params.outdir}/taxonomy/mmseqs_taxonomy/" }, + mode: params.publish_dir_mode, + enabled: params.taxonomy_mmseqs_save_intermedfiles, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = [ + params.taxonomy_mmseqs_taxonomy_savetmp ? "" : "--remove-tmp-files", + "--search-type ${params.taxonomy_mmseqs_taxonomy_searchtype}", + "--lca-ranks ${params.taxonomy_mmseqs_taxonomy_lcaranks}", + "--tax-lineage ${params.taxonomy_mmseqs_taxonomy_taxlineage}", + "-s ${params.taxonomy_mmseqs_taxonomy_sensitivity}", + "--orf-filter-s ${params.taxonomy_mmseqs_taxonomy_orffilters}", + "--lca-mode ${params.taxonomy_mmseqs_taxonomy_lcamode}", + "--vote-mode ${params.taxonomy_mmseqs_taxonomy_votemode}", + "--majority ${params.taxonomy_mmseqs_taxonomy_majority}" + ].join(' ').trim() + } + + withName: MMSEQS_CREATETSV { + publishDir = [ + path: { "${params.outdir}/taxonomy/mmseqs_createtsv/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } withName: PROKKA { publishDir = [ diff --git a/nextflow.config b/nextflow.config index 5387a34f..edcaa6ca 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,11 +19,21 @@ params { // Taxonomy classification options classify_taxonomy = true - - classify_taxonomy_mmseqs_db_localpath = null - classify_taxonomy_mmseqs_db = 'SILVA' - classify_taxonomy_mmseqs_db_savetmp = false - + taxonomy_mmseqs_save_intermedfiles = false + + taxonomy_mmseqs_databases_localpath = null + taxonomy_mmseqs_databases_id = 'SILVA' + taxonomy_mmseqs_databases_savetmp = false + + taxonomy_mmseqs_taxonomy_savetmp = false + taxonomy_mmseqs_taxonomy_searchtype = 2 + taxonomy_mmseqs_taxonomy_lcaranks = 'kingdom,phylum,class,order,family,genus,species' + taxonomy_mmseqs_taxonomy_taxlineage = 1 + taxonomy_mmseqs_taxonomy_sensitivity = 5.0 + taxonomy_mmseqs_taxonomy_orffilters = 2.0 + taxonomy_mmseqs_taxonomy_lcamode = 3 + taxonomy_mmseqs_taxonomy_votemode = 1 + taxonomy_mmseqs_taxonomy_majority = 0.5 // Annotation options annotation_tool = 'pyrodigal' diff --git a/nextflow_schema.json b/nextflow_schema.json index c0b5f623..79fafdac 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -68,6 +68,23 @@ }, "fa_icon": "fas fa-network-wired" }, + "taxonomy": { + "title": "Taxonomy", + "type": "object", + "description": "These options influence whether to activate the taxonomic classification of contigs", + "default": "", + "fa_icon": "fas fa-address-book", + "properties": { + "classify_taxonomy": { + "type": "boolean", + "default": true, + "description": "Activates the taxonomic classification of contigs" + }, + "taxonomy_mmseqs_save_intermedfiles": { + "type": "boolean" + } + } + }, "annotation": { "title": "Annotation", "type": "object", @@ -87,7 +104,8 @@ "fa_icon": "fas fa-save" } }, - "fa_icon": "fas fa-file-signature" + "fa_icon": "fas fa-file-signature", + "help_text": "" }, "annotation_bakta": { "title": "Annotation: BAKTA", @@ -259,7 +277,7 @@ "default": 
"Bacteria", "fa_icon": "fab fa-accusoft", "description": "Specify the kingdom that the input represents.", - "help_text": "Specifies the kingdom that the input sample is derived from and/or you wish to screen for\n\n> ⚠️ Prokka cannot annotate Eukaryotes.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--kingdom`", + "help_text": "Specifies the kingdom that the input sample is derived from and/or you wish to screen for\n\n> \u26a0\ufe0f Prokka cannot annotate Eukaryotes.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--kingdom`", "enum": ["Archaea", "Bacteria", "Mitochondria", "Viruses"] }, "annotation_prokka_gcode": { @@ -280,7 +298,7 @@ }, "annotation_prokka_evalue": { "type": "number", - "default": 0.000001, + "default": 1e-6, "description": "Minimum e-value cut-off.", "help_text": "Specifiy the minimum e-value used for filtering the alignment hits.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--evalue`", "fa_icon": "fas fa-sort-amount-down" @@ -528,7 +546,7 @@ }, "amp_ampcombi_cutoff": { "type": "number", - "default": 0.0, + "default": 0, "description": "Specify probability cutoff to filter AMPs", "help_text": "Specify the minimum probability an AMP hit must have to be retained in the final output file. 
Anything below this threshold will be removed.\n\nFor more information check AMPcombi [documentation](https://github.com/Darcy220606/AMPcombi).\n\n> Modifies tool parameter(s):\n> - AMPCOMBI: `--cutoff`", "fa_icon": "fas fa-sort-amount-up" @@ -557,7 +575,7 @@ }, "arg_amrfinderplus_identmin": { "type": "number", - "default": -1.0, + "default": -1, "help_text": "Specify the minimum percentage amino-acid identity to reference protein or nucleotide identity for nucleotide reference must have if a BLAST alignment (based on methods: BLAST or PARTIAL) was detected, otherwise NA.\n\n If you specify `-1`, this means use a curated threshold if it exists and `0.9` otherwise.\n\nSetting this value to something other than `-1` will override any curated similarity cutoffs. For BLAST: alignment is > 90% of length and > 90% identity to a protein in the AMRFinderPlus database. For PARTIAL: alignment is > 50% of length, but < 90% of length and > 90% identity to the reference, and does not end at a contig boundary.\n\nFor more information check AMRFinderPlus [documentation](https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#--organism-option).\n\n> Modifies tool parameter(s):\n> - AMRFinderPlus: `--ident_min`", "description": "Minimum percent identity to reference sequence.", "fa_icon": "fas fa-angle-left" @@ -865,7 +883,7 @@ "default": 1000, "description": "Minimum longest-contig length a sample must have to be screened with antiSMASH.", "fa_icon": "fas fa-ruler-horizontal", - "help_text": "This specifies the minimum length that the longest contig must have for the entire sample to be screened by antiSMASH.\n\nAny samples that do not reach this length will be not be sent to antiSMASH, therefore you will not receive output for these samples in your `--outdir`.\n\n> ⚠️ This is not the same as `--bgc_antismash_contigminlength`, which specifies to only analyse contigs above that threshold but _within_ a sample that has already passed `--bgc_antismash_sampleminlength` sample filter!" 
+ "help_text": "This specifies the minimum length that the longest contig must have for the entire sample to be screened by antiSMASH.\n\nAny samples that do not reach this length will be not be sent to antiSMASH, therefore you will not receive output for these samples in your `--outdir`.\n\n> \u26a0\ufe0f This is not the same as `--bgc_antismash_contigminlength`, which specifies to only analyse contigs above that threshold but _within_ a sample that has already passed `--bgc_antismash_sampleminlength` sample filter!" }, "bgc_antismash_contigminlength": { "type": "integer", @@ -1033,7 +1051,7 @@ "type": "number", "description": "The p-value cutoff for protein domains to be included.", "fa_icon": "fas fa-filter", - "default": 0.000000001, + "default": 1e-9, "help_text": "The p-value cutoff for protein domains to be included.\n\nFor more information see the GECCO [documentation](https://github.com/zellerlab/GECCO).\n\n> Modifies tool parameter(s):\n> - GECCO: `--pfilter`" }, "bgc_gecco_threshold": { @@ -1337,15 +1355,15 @@ } }, "allOf": [ - { - "$ref": "#/definitions/annotation_pyrodigal" - }, { "$ref": "#/definitions/input_output_options" }, { "$ref": "#/definitions/screening_type_activation" }, + { + "$ref": "#/definitions/taxonomy" + }, { "$ref": "#/definitions/annotation" }, @@ -1358,6 +1376,9 @@ { "$ref": "#/definitions/annotation_prodigal" }, + { + "$ref": "#/definitions/annotation_pyrodigal" + }, { "$ref": "#/definitions/database_downloading_options" }, @@ -1418,5 +1439,52 @@ { "$ref": "#/definitions/generic_options" } - ] + ], + "properties": { + "taxonomy_mmseqs_databases_localpath": { + "type": "string" + }, + "taxonomy_mmseqs_databases_id": { + "type": "string", + "default": "SILVA" + }, + "taxonomy_mmseqs_databases_savetmp": { + "type": "boolean" + }, + "taxonomy_mmseqs_taxonomy_savetmp": { + "type": "boolean" + }, + "taxonomy_mmseqs_taxonomy_searchtype": { + "type": "integer", + "default": 2 + }, + "taxonomy_mmseqs_taxonomy_lcaranks": { + "type": 
"string", + "default": "kingdom,phylum,class,order,family,genus,species" + }, + "taxonomy_mmseqs_taxonomy_taxlineage": { + "type": "integer", + "default": 1 + }, + "taxonomy_mmseqs_taxonomy_sensitivity": { + "type": "integer", + "default": 5 + }, + "taxonomy_mmseqs_taxonomy_orffilters": { + "type": "integer", + "default": 2 + }, + "taxonomy_mmseqs_taxonomy_lcamode": { + "type": "integer", + "default": 3 + }, + "taxonomy_mmseqs_taxonomy_votemode": { + "type": "integer", + "default": 1 + }, + "taxonomy_mmseqs_taxonomy_majority": { + "type": "number", + "default": 0.5 + } + } } diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 92c60190..3efcb48a 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -155,7 +155,7 @@ workflow FUNCSCAN { .fromPath( params.classify_taxonomy_mmseqs_db_localpath ) .first() } else { - MMSEQS_DATABASES ( params.classify_taxonomy_mmseqs_db ) + MMSEQS_DATABASES ( params.taxonomy_mmseqs_databases_id ) ch_versions = ch_versions.mix( MMSEQS_DATABASES.out.versions ) ch_mmseqs_db = ( MMSEQS_DATABASES.out.database ) } From d864eeae894d0e17ba562ff46b5a496de395c88c Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Wed, 28 Feb 2024 16:23:01 +0100 Subject: [PATCH 03/39] Add parametrs to schema version1 --- nextflow_schema.json | 4 +++- workflows/funcscan.nf | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 79fafdac..27f053fd 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1442,7 +1442,9 @@ ], "properties": { "taxonomy_mmseqs_databases_localpath": { - "type": "string" + "type": "string", + "description": "", + "help_text": "" }, "taxonomy_mmseqs_databases_id": { "type": "string", diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 3efcb48a..0488cd1b 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -152,7 +152,7 @@ workflow FUNCSCAN { // Download the ref db if not supplied by user if ( 
params.classify_taxonomy_mmseqs_db_localpath ) { ch_mmseqs_db = Channel - .fromPath( params.classify_taxonomy_mmseqs_db_localpath ) + .fromPath( params.taxonomy_mmseqs_databases_localpath ) .first() } else { MMSEQS_DATABASES ( params.taxonomy_mmseqs_databases_id ) From 1a6b480aa3dbfe7e9d09c48f4bfe4c03fc2ef36f Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Thu, 29 Feb 2024 12:15:26 +0100 Subject: [PATCH 04/39] update the schema --- conf/modules.config | 1 - nextflow.config | 5 +- nextflow_schema.json | 155 ++++++++++++++++++++++++++++--------------- 3 files changed, 105 insertions(+), 56 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0faf5f7c..3cb9826f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -89,7 +89,6 @@ process { "--orf-filter-s ${params.taxonomy_mmseqs_taxonomy_orffilters}", "--lca-mode ${params.taxonomy_mmseqs_taxonomy_lcamode}", "--vote-mode ${params.taxonomy_mmseqs_taxonomy_votemode}", - "--majority ${params.taxonomy_mmseqs_taxonomy_majority}" ].join(' ').trim() } diff --git a/nextflow.config b/nextflow.config index edcaa6ca..b5076cd4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -29,11 +29,10 @@ params { taxonomy_mmseqs_taxonomy_searchtype = 2 taxonomy_mmseqs_taxonomy_lcaranks = 'kingdom,phylum,class,order,family,genus,species' taxonomy_mmseqs_taxonomy_taxlineage = 1 - taxonomy_mmseqs_taxonomy_sensitivity = 5.0 - taxonomy_mmseqs_taxonomy_orffilters = 2.0 + taxonomy_mmseqs_taxonomy_sensitivity = '5.0' + taxonomy_mmseqs_taxonomy_orffilters = '2.0' taxonomy_mmseqs_taxonomy_lcamode = 3 taxonomy_mmseqs_taxonomy_votemode = 1 - taxonomy_mmseqs_taxonomy_majority = 0.5 // Annotation options annotation_tool = 'pyrodigal' diff --git a/nextflow_schema.json b/nextflow_schema.json index 27f053fd..088cc41b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -78,13 +78,107 @@ "classify_taxonomy": { "type": "boolean", "default": true, - "description": "Activates the taxonomic classification of contigs" + 
"description": "Activates the taxonomic classification of contigs", + "fa_icon": "fas fa-ad" }, "taxonomy_mmseqs_save_intermedfiles": { - "type": "boolean" + "type": "boolean", + "fa_icon": "fas fa-file-download" } } }, + "taxonomy_mmseqs_databases": { + "title": "Taxonomy: MMseqs databases", + "type": "object", + "description": "These parameters influence the database to be used in classifying the taxonomy.", + "default": "", + "properties": { + "taxonomy_mmseqs_databases_localpath": { + "type": "string", + "description": "Specifiy a path to MMsqes2 formatted database.", + "help_text": "Specify a path to a database that is prepared in MMseqs2 format as detailed in the [documentation](https://mmseqs.com/latest/userguide.pdf).", + "fa_icon": "fab fa-stackpath" + }, + "taxonomy_mmseqs_databases_id": { + "type": "string", + "default": "SILVA", + "help_text": "Specify the MMseqs2 formatted database to use to classify the input contigs. This can be a nucleotide or amino acid database, however the database chosen must include taxonomic classifications. For example both GTDB, an amico acid database and SILVA, a nucleotide database are both databases provided by MMseqs2 that have taxonomic classifications. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases \".", + "description": "Specify the label of the database to be used.", + "fa_icon": "fas fa-address-card" + }, + "taxonomy_mmseqs_databases_savetmp": { + "type": "boolean", + "help_text": "This flag saves in the output folder the temporary files created when downloading the database and creating in the mmseqs2 format. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). 
\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases: `--remove-tmp-files`\".", + "description": "Specify whether the temporary files should be saved.", + "fa_icon": "fas fa-file-download" + } + }, + "fa_icon": "far fa-address-card" + }, + "taxonomy_mmseqs2_taxonomy": { + "title": "Taxonomy: MMseqs2 taxonomy", + "type": "object", + "description": "These parameters influence the taxonomic classification step.", + "default": "", + "properties": { + "taxonomy_mmseqs_taxonomy_savetmp": { + "type": "boolean", + "help_text": "This flag saves in the output folder the temporary files created when creating the taxonomy database and final 'tsv' file. Save More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--remove-tmp-files`\".\n", + "description": "Specify whether to save the temporary files.", + "fa_icon": "fab fa-adversal" + }, + "taxonomy_mmseqs_taxonomy_searchtype": { + "type": "integer", + "default": 2, + "help_text": "Specify the type of alignment to be carried out between the query database and the reference MMseqs2 database. This can be set to '0' for automatic detection, '1' for amino acid alignment, '2' for translating the inputs and running the alignment on the translated sequences, '3' nucleotide based alignment and '4' for the translated nucleotide sequences alignment. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--search-type`\".", + "description": "Specify the alignment type between database and query.", + "fa_icon": "fas fa-align-center" + }, + "taxonomy_mmseqs_taxonomy_lcaranks": { + "type": "string", + "default": "kingdom,phylum,class,order,family,genus,species", + "help_text": "Specify the taxonomic ranks to include in the taxonomic lineage column in the final '.tsv' file. For example, 'kingdom,phylum,class,order,family,genus,species'. 
More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-ranks`\".", + "description": "Specify the taxonomic levels to display in the result table.", + "fa_icon": "fas fa-stream" + }, + "taxonomy_mmseqs_taxonomy_taxlineage": { + "type": "integer", + "default": 1, + "help_text": "This flag specifies whether the taxonomic lineage should be included in the output '.tsv' file. The taxonomic lineage is obtained due to the internal module of mmseqs taxonomy that implements the least common ancestor to classify the taxonomy. A value of '0' writes no taxonomic lineage, a value of '1' adds a column with the full lineage names prefixed with abbreviation of the lineage level, e.g., k_Prokaryotes;p_Bacteroidetes;c_....;o_....;f_....;g_....;s_...., while a value of '2' adds a column with the full NCBI taxids lineage,e.g., 1324;2345;4546;5345. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--tax-lineage`\".", + "description": "Specify whether to include or remove the taxonomic lineage.", + "fa_icon": "fab fa-audible" + }, + "taxonomy_mmseqs_taxonomy_sensitivity": { + "type": "string", + "default": "5.0", + "help_text": "This flag specifies the speed and sensitivity of the taxonomic search. It stands for how many kmers should be produced during the preliminary seeding stage. A very fast search requires a low value e.g., '1.0' and a a very sensitive search requires e.g., '7.0'. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). 
\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--s`\".", + "description": "Specify the speed and sensitivity for taxonomy assignment.", + "fa_icon": "fas fa-history" + }, + "taxonomy_mmseqs_taxonomy_orffilters": { + "type": "string", + "default": "2.0", + "help_text": "This flag specifies the sensitivity used for prefiltering the query ORF. Before the taxonomy assigning step, mmseqs2 searches the predicted ORFs against the database provided. This value specifies the speed with which the search is carried out. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--orf-filter-s`\".", + "description": "Specify the ORF search speed in the prefilter step.", + "fa_icon": "fas fa-clock" + }, + "taxonomy_mmseqs_taxonomy_lcamode": { + "type": "integer", + "default": 3, + "help_text": "This flag specifies the strategy used for assigning the least common ancestor (LCA). MMseqs2 assigns taxonomy based on an accelerated approximation of the 2bLCA protocol and uses the value of '3'. In this mode the taxonomic assignment is based not only on usual alignment parameters but also considers the taxonomic classification of the LCA. When the value '4' is used the LCA is assigned based on all the equal scoring top hits. If the value '1' is used the LCA assignment is disregarded and the taxonomic assignment is based on usual alignment parameters like evalue and coverage. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-mode`\".", + "description": "Specify the mode to assign the taxonomy.", + "fa_icon": "fas fa-broom" + }, + "taxonomy_mmseqs_taxonomy_votemode": { + "type": "integer", + "default": 1, + "help_text": "This flag assigns the mode value with which the weights are computed. 
The value of '0' stands for uniform weights of taxonomy assignments, the value of '1' uses the minus log E-value and '2' the actual score. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--vote-mode`\".", + "description": "Specify the weights of the taxonomic assignment.", + "fa_icon": "fas fa-poll" + } + }, + "fa_icon": "fas fa-tag" + }, "annotation": { "title": "Annotation", "type": "object", @@ -1364,6 +1458,12 @@ { "$ref": "#/definitions/taxonomy" }, + { + "$ref": "#/definitions/taxonomy_mmseqs_databases" + }, + { + "$ref": "#/definitions/taxonomy_mmseqs2_taxonomy" + }, { "$ref": "#/definitions/annotation" }, @@ -1439,54 +1539,5 @@ { "$ref": "#/definitions/generic_options" } - ], - "properties": { - "taxonomy_mmseqs_databases_localpath": { - "type": "string", - "description": "", - "help_text": "" - }, - "taxonomy_mmseqs_databases_id": { - "type": "string", - "default": "SILVA" - }, - "taxonomy_mmseqs_databases_savetmp": { - "type": "boolean" - }, - "taxonomy_mmseqs_taxonomy_savetmp": { - "type": "boolean" - }, - "taxonomy_mmseqs_taxonomy_searchtype": { - "type": "integer", - "default": 2 - }, - "taxonomy_mmseqs_taxonomy_lcaranks": { - "type": "string", - "default": "kingdom,phylum,class,order,family,genus,species" - }, - "taxonomy_mmseqs_taxonomy_taxlineage": { - "type": "integer", - "default": 1 - }, - "taxonomy_mmseqs_taxonomy_sensitivity": { - "type": "integer", - "default": 5 - }, - "taxonomy_mmseqs_taxonomy_orffilters": { - "type": "integer", - "default": 2 - }, - "taxonomy_mmseqs_taxonomy_lcamode": { - "type": "integer", - "default": 3 - }, - "taxonomy_mmseqs_taxonomy_votemode": { - "type": "integer", - "default": 1 - }, - "taxonomy_mmseqs_taxonomy_majority": { - "type": "number", - "default": 0.5 - } - } + ] } From 0c56c1f7e1c6562a7764affdc2ce0ff6fdb394eb Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Thu, 29 Feb 2024 13:55:45 +0100 Subject: [PATCH 
05/39] add the docs info --- CITATIONS.md | 4 ++++ docs/output.md | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CITATIONS.md b/CITATIONS.md index 84ada6d1..fd346404 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -90,6 +90,10 @@ > Alcock, B. P., Raphenya, A. R., Lau, T., Tsang, K. K., Bouchard, M., Edalatmand, A., Huynh, W., Nguyen, A. V., Cheng, A. A., Liu, S., Min, S. Y., Miroshnichenko, A., Tran, H. K., Werfalli, R. E., Nasir, J. A., Oloni, M., Speicher, D. J., Florescu, A., Singh, B., Faltyn, M., … McArthur, A. G. (2020). CARD 2020: antibiotic resistome surveillance with the comprehensive antibiotic resistance database. Nucleic acids research, 48(D1), D517–D525. [DOI: 10.1093/nar/gkz935](https://doi.org/10.1093/nar/gkz935) +- [MMseqs2](https://doi.org/10.1093bioinformatics/btab184) + + > Mirdita M., Steinegger M., Breitwieser F., Söding J., Levy Karin E. (2021). Fastand sensitive taxonomic assignment to metagenomic contigs, Bioinformatics, 37(18),3029–3031. [DOI: 10.1093/bioinformatics/btab184](https://doi.org/10.1093bioinformatics/btab184) + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/docs/output.md b/docs/output.md index d515d1f5..c04ff5fc 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,7 +10,7 @@ The output of nf-core/funcscan provides reports for each of the functional group As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. 
The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. -Similarly, all downloaded databases are saved (i.e. from [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/downloads/` if the `--save_databases` flag was set. +Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/downloads/` if the `--save_databases` flag was set. Furthermore, for reproducibility, versions of all software used in the run is presented in a [MultiQC](http://multiqc.info) report. @@ -18,6 +18,8 @@ The directories listed below will be created in the results directory (specified ```console results/ +├── taxonomy/ +| ├── mmseqs_createtsv/ ├── annotation/ | ├── bakta/ | ├── prodigal @@ -54,6 +56,10 @@ work/ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes prokaryotic sequence data through the following steps: +Taxonomy classification of contigs with: + + - [MMseqs2](https://github.com/soedinglab/MMseqs2) (default) - for contig taxonomic classification using 2bLCA. 
+ ORF prediction and annotation with any of: - [Pyrodigal](#pyrodigal) (default) – for open reading frame prediction. @@ -93,6 +99,22 @@ Output Summaries: ## Tool details +### Taxonomic classification tool +[MMseqs2](#MMseqs2) + +
+Output files + +- `taxonomy/mmseqs2_createtsv/` + - `/`: + - `*.tsv`: tab seperated table containing the taxonomic lineage of every contig when available + +> Descriptions taken from the [MMseqs2 documentation](https://github.com/soedinglab/MMseqs2/wiki) + +
+ +[MMseqs2](https://github.com/soedinglab/MMseqs2) classifies the taxonomic lineage of contigs based on the least common ancestor. The taxonomic lineage produced is also added to the final workflow summaries to annotate the potential source bacteria of the BGC, AMP, and ARG. + ### Annotation tools [Pyrodigal](#pyrodigal), [Prodigal](#prodigal), [Prokka](#prokka), [Bakta](#bakta) From c3a80ca2c298ede9f80f36eec6fc2d5b18f3155e Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Thu, 29 Feb 2024 16:05:28 +0100 Subject: [PATCH 06/39] working draft --- nextflow.config | 4 ++-- workflows/funcscan.nf | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/nextflow.config b/nextflow.config index b5076cd4..67609d24 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,11 +18,11 @@ params { igenomes_ignore = false // Taxonomy classification options - classify_taxonomy = true + taxonomy_mmseqs_classification_off = false taxonomy_mmseqs_save_intermedfiles = false taxonomy_mmseqs_databases_localpath = null - taxonomy_mmseqs_databases_id = 'SILVA' + taxonomy_mmseqs_databases_id = 'Kalamari' taxonomy_mmseqs_databases_savetmp = false taxonomy_mmseqs_taxonomy_savetmp = false diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 0488cd1b..5dd0254a 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -146,11 +146,10 @@ workflow FUNCSCAN { // The final subworkflow reports need taxonomic classification // This can be either on NT or AA level depending on annotation // NOTE: (AA tax. 
classification will be added only when its PR is merged - NOW - only on NT) - //TODO RUN MMSEQS/database /create db and taxonomy and converttsv / and grab teh output table - if ( params.classify_taxonomy == true ) { + if ( params.taxonomy_mmseqs_classification_off == false ) { // Download the ref db if not supplied by user - if ( params.classify_taxonomy_mmseqs_db_localpath ) { + if ( params.taxonomy_mmseqs_databases_localpath != null ) { ch_mmseqs_db = Channel .fromPath( params.taxonomy_mmseqs_databases_localpath ) .first() @@ -166,8 +165,9 @@ workflow FUNCSCAN { ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db MMSEQS_TAXONOMY ( ch_taxonomy_querydb, ch_mmseqs_db ) ch_versions = ch_versions.mix(MMSEQS_TAXONOMY.out.versions) - ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db - MMSEQS_CREATETSV ( ch_taxonomy_querydb, ch_taxonomy_querydb_taxdb, [[:],[]] ) + ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db_taxonomy + + MMSEQS_CREATETSV ( ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb ) ch_versions = ch_versions.mix(MMSEQS_CREATETSV.out.versions) ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv From 6f2c076432c43977b72ed04b30c8c93e5f788120 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Thu, 29 Feb 2024 16:42:09 +0100 Subject: [PATCH 07/39] adjust mmseqs/createtsv step --- nextflow_schema.json | 13 +++++++------ workflows/funcscan.nf | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 088cc41b..0aedd283 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -75,15 +75,16 @@ "default": "", "fa_icon": "fas fa-address-book", "properties": { - "classify_taxonomy": { + "taxonomy_mmseqs_classification_off": { "type": "boolean", - "default": true, - "description": "Activates the taxonomic classification of contigs", - "fa_icon": "fas fa-ad" + "fa_icon": "fas fa-ad", + "description": "Activates the taxonomic classification of input contigs." 
}, "taxonomy_mmseqs_save_intermedfiles": { "type": "boolean", - "fa_icon": "fas fa-file-download" + "fa_icon": "fas fa-file-download", + "description": "Save any intermediate files created in the taxonomic classification step.", + "help_text": "This flag saves to the output folder all the databases created to generate the final taxonomic lineages." } } }, @@ -101,7 +102,7 @@ }, "taxonomy_mmseqs_databases_id": { "type": "string", - "default": "SILVA", + "default": "Kalamari", "help_text": "Specify the MMseqs2 formatted database to use to classify the input contigs. This can be a nucleotide or amino acid database, however the database chosen must include taxonomic classifications. For example both GTDB, an amico acid database and SILVA, a nucleotide database are both databases provided by MMseqs2 that have taxonomic classifications. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases \".", "description": "Specify the label of the database to be used.", "fa_icon": "fas fa-address-card" diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 5dd0254a..1e27d69b 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -167,6 +167,7 @@ workflow FUNCSCAN { ch_versions = ch_versions.mix(MMSEQS_TAXONOMY.out.versions) ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db_taxonomy + // MMSEQS_CREATETSV ( ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb ) MMSEQS_CREATETSV ( ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb ) ch_versions = ch_versions.mix(MMSEQS_CREATETSV.out.versions) ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv From 781ae228dd9fec371d042b0f2059de4015bc40a3 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Sun, 3 Mar 2024 23:55:38 +0100 Subject: [PATCH 08/39] add the merging step - working locally --- CHANGELOG.md | 4 + README.md | 14 +- bin/comBGC.py | 4 + bin/merge_taxonomy.py | 231 ++++++++++++++++++ docs/output.md | 16 +- 
modules/local/merge_taxonomy_ampcombi.nf | 32 +++ modules/local/merge_taxonomy_combgc.nf | 32 +++ modules/local/merge_taxonomy_hamronization.nf | 32 +++ subworkflows/local/amp.nf | 17 +- subworkflows/local/arg.nf | 32 ++- subworkflows/local/bgc.nf | 7 + workflows/funcscan.nf | 30 ++- 12 files changed, 415 insertions(+), 36 deletions(-) create mode 100755 bin/merge_taxonomy.py create mode 100644 modules/local/merge_taxonomy_ampcombi.nf create mode 100644 modules/local/merge_taxonomy_combgc.nf create mode 100644 modules/local/merge_taxonomy_hamronization.nf diff --git a/CHANGELOG.md b/CHANGELOG.md index b8147b46..52e348d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,9 +11,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#324](https://github.com/nf-core/funcscan/pull/324) Removed separate DeepARG test profile because database download is now stable. (by @jasmezz) - [#332](https://github.com/nf-core/funcscan/pull/332) & [#327](https://github.com/nf-core/funcscan/pull/327) Merged pipeline template of nf-core/tools version 2.12.1 (by @jfy133, @jasmezz) - [#338](https://github.com/nf-core/funcscan/pull/338) Set `--meta` parameter to default for Bakta, with singlemode optional. (by @jasmezz) +- [#343](https://github.com/nf-core/funcscan/pull/343) Added contig taxonomic classification using [MMseqs2](https://github.com/soedinglab/MMseqs2/). (by @darcy220606) ### `Fixed` +- [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with 'sample_id\tcontig_id\t..'. (by @darcy220606) +- [#343](https://github.com/nf-core/funcscan/pull/343) Reformated the the output of 'hamronization summarize' module. 
(by @darcy220606) + ### `Dependencies` | Tool | Previous version | New version | diff --git a/README.md b/README.md index 61b789c8..f591f55d 100644 --- a/README.md +++ b/README.md @@ -28,13 +28,13 @@ On release, automated continuous integration tests run the pipeline on a full-si The nf-core/funcscan AWS full test dataset are contigs generated by the MGnify service from the ENA. We used contigs generated from assemblies of chicken cecum shotgun metagenomes (study accession: MGYS00005631). ## Pipeline summary - -1. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta) -2. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify) -3. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg) -4. Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/) -5. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs -6. Software version and methods text reporting with [`MultiQC`](http://multiqc.info/) +1. 
Taxonomic classification of contigs from **of prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2). +2. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta) +3. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify) +4. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg) +5. Screening contigs for biosynthetic gene cluster-like sequences with [`antiSMASH`](https://antismash.secondarymetabolites.org), [`DeepBGC`](https://github.com/Merck/deepbgc), [`GECCO`](https://gecco.embl.de/), [`HMMER`](http://hmmer.org/) +6. Creating aggregated reports for all samples across the workflows with [`AMPcombi`](https://github.com/Darcy220606/AMPcombi) for AMPs, [`hAMRonization`](https://github.com/pha4ge/hAMRonization) for ARGs, and [`comBGC`](https://raw.githubusercontent.com/nf-core/funcscan/master/bin/comBGC.py) for BGCs +7. 
Software version and methods text reporting with [`MultiQC`](http://multiqc.info/) ![funcscan metro workflow](docs/images/funcscan_metro_workflow.png) diff --git a/bin/comBGC.py b/bin/comBGC.py index a492af97..12bcff01 100755 --- a/bin/comBGC.py +++ b/bin/comBGC.py @@ -643,6 +643,10 @@ def gecco_workflow(gecco_paths): inplace=True, ) + # Rearrange and rename the columns in the summary df + summary_all = summary_all.iloc[:, [0, 2, 1] + list(range(3, len(summary_all.columns)))] + summary_all.rename(columns={'Sample_ID':'sample_id', 'Contig_ID':'contig_id', 'CDS_ID':'BGC_region_contig_ids'}, inplace=True) + # Write results to TSV if not os.path.exists(outdir): os.makedirs(outdir) diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py new file mode 100755 index 00000000..6b5db97b --- /dev/null +++ b/bin/merge_taxonomy.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 + +# Author: @darcy220606 +# Date: March 2024 +# Version: 0.1.0 + +# Required modules +import sys +import os +import pandas as pd +import numpy as np +import argparse + +tool_version = "0.1.0" +######################################### +# TOP LEVEL: AMPCOMBI +######################################### +parser = argparse.ArgumentParser(prog = 'merge_taxonomy', formatter_class=argparse.RawDescriptionHelpFormatter, + usage='%(prog)s [options]', + description=('''\ + ............................................................................. + *merge_taxonomy* + ............................................................................. + This script merges all three funcscan workflows with + MMseqs2 taxonomy results. This is done in three submodules that can be + activated seperately. 
+ .............................................................................'''), + epilog='''Thank you for running taxonomy_merge!''', + add_help=True) +parser.add_argument('--version', action='version', version='merge_taxonomy ' + tool_version) + +######################################### +# SUBPARSERS +######################################### +subparsers = parser.add_subparsers(required=True) + +######################################### +# SUBPARSERS : AMPCOMBI +######################################### +ampcombi_parser = subparsers.add_parser('ampcombi_taxa') + +ampcombi_parser.add_argument("--ampcombi", dest="amp", nargs='?', help="Enter the path to the ampcombi_complete_summary.tsv' \n (default: %(default)s)", + type=str, default='ampcombi_complete_summary.csv') +ampcombi_parser.add_argument("--taxonomy", dest="taxa1", nargs='+', help="Enter the list of taxonomy files for all samples. ") + +######################################### +# SUBPARSERS : COMBGC +######################################### +combgc_parser = subparsers.add_parser('combgc_taxa') + +combgc_parser.add_argument("--combgc", dest="bgc", nargs='?', help="Enter the path to the combgc_complete_summary.tsv' \n (default: %(default)s)", + type=str, default='combgc_complete_summary.csv') +combgc_parser.add_argument("--taxonomy", dest="taxa2", nargs='+', help="Enter the list of taxonomy files for all samples. ") + +######################################### +# SUBPARSERS : HAMRONIZATION +######################################### +hamronization_parser = subparsers.add_parser('hamronization_taxa') + +hamronization_parser.add_argument("--hamronization", dest="arg", nargs='?', help="Enter the path to the hamronization_complete_summary.tsv' \n (default: %(default)s)", + type=str, default='hamronization_complete_summary.csv') +hamronization_parser.add_argument("--taxonomy", dest="taxa3",nargs='+', help="Enter the list of taxonomy files for all samples. 
") + +######################################### +# TAXONOMY +######################################### +def reformat_mmseqs_taxonomy(mmseqs_taxonomy): + mmseqs2_df = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'lineage', 'mmseqs_lineage_contig']) + # remove the lineage column + mmseqs2_df.drop('lineage', axis=1, inplace=True) + mmseqs2_df['mmseqs_lineage_contig'].unique() + # convert any classification that has Eukaryota/root to NaN as funcscan targets bacteria ONLY ** + for i, row in mmseqs2_df.iterrows(): + lineage = str(row['mmseqs_lineage_contig']) + if 'Eukaryota' in lineage or 'root' in lineage: + mmseqs2_df.at[i, 'mmseqs_lineage_contig'] = np.nan + #mmseqs2_df['mmseqs_lineage_contig'].unique() + # insert the sample name in the first column according to the file basename + file_basename = os.path.basename(mmseqs_taxonomy) + filename = os.path.splitext(file_basename)[0] + mmseqs2_df.insert(0, 'sample_id', filename) + return mmseqs2_df + +######################################### +# FUNCTION : AMPCOMBI +######################################### +def ampcombi_taxa(args): + merged_df = pd.DataFrame() + + # assign input args to variables + ampcombi = args.amp + taxa_list = args.taxa1 + + # prepare the taxonomy files + taxa_df = pd.DataFrame() + # append the dfs to the taxonomy_files_combined + for file in taxa_list: # list of taxa files ['',''] + df = reformat_mmseqs_taxonomy(file) + taxa_df = pd.concat([taxa_df, df]) + + # filter the tool df + tool_df = pd.read_csv(ampcombi, sep=',') #current ampcombi version is comma sep. 
CHANGE WITH VERSION 0.2.0 + # make sure 1st and 2nd column have the same column labels + tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True) + tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True) + # grab the real contig id in another column copy for merging + tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0] + + # merge rows from taxa to ampcombi_df based on substring match in sample_id + # grab the unique sample names from the taxonomy table + samples_taxa = taxa_df['sample_id'].unique() + # for every sampleID in taxadf merge the results + for sampleID in samples_taxa: + # subset ampcombi + subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)] + # subset taxa + subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)] + # merge + subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left') + # cleanup the table + columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y'] + subset_df.drop(columnsremove, axis=1, inplace=True) + subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True) + # append in the combined_df + merged_df = merged_df.append(subset_df, ignore_index=True) + + # write to file + merged_df.to_csv('ampcombi_complete_summary_taxonomy.tsv', sep='\t', index=False) + +######################################### +# FUNCTION : COMBGC +######################################### +def combgc_taxa(args): + merged_df = pd.DataFrame() + + # assign input args to variables + combgc = args.bgc + taxa_list = args.taxa2 + + # prepare the taxonomy files + taxa_df = pd.DataFrame() + # append the dfs to the taxonomy_files_combined + for file in taxa_list: # list of taxa files ['',''] + df = reformat_mmseqs_taxonomy(file) + taxa_df = pd.concat([taxa_df, df]) + + # filter the tool df + tool_df = pd.read_csv(combgc, sep='\t') + # make sure 1st and 2nd column have the same column labels + 
tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True) + tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True) + + # merge rows from taxa to ampcombi_df based on substring match in sample_id + # grab the unique sample names from the taxonomy table + samples_taxa = taxa_df['sample_id'].unique() + # for every sampleID in taxadf merge the results + for sampleID in samples_taxa: + # subset ampcombi + subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)] + # subset taxa + subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)] + # merge + subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id', right_on='contig_id', how='left') + # cleanup the table + columnsremove = ['sample_id_y'] + subset_df.drop(columnsremove, axis=1, inplace=True) + subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True) + # append in the combined_df + merged_df = merged_df.append(subset_df, ignore_index=True) + + # write to file + merged_df.to_csv('combgc_complete_summary_taxonomy.tsv', sep='\t', index=False) + +######################################### +# FUNCTION : HAMRONIZATION +######################################### +def hamronization_taxa(args): + merged_df = pd.DataFrame() + + # assign input args to variables + hamronization = args.arg + taxa_list = args.taxa3 + + # prepare the taxonomy files + taxa_df = pd.DataFrame() + # append the dfs to the taxonomy_files_combined + for file in taxa_list: # list of taxa files ['',''] + df = reformat_mmseqs_taxonomy(file) + taxa_df = pd.concat([taxa_df, df]) + + # filter the tool df + tool_df = pd.read_csv(hamronization, sep='\t') + # rename the columns + tool_df.rename(columns={'input_file_name':'sample_id', 'input_sequence_id':'contig_id'}, inplace=True) + # reorder the columns + new_order = ['sample_id', 'contig_id'] + [col for col in tool_df.columns if col not in ['sample_id', 'contig_id']] + tool_df = tool_df.reindex(columns=new_order) + # grab the real 
contig id in another column copy for merging + tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0] + + # merge rows from taxa to ampcombi_df based on substring match in sample_id + # grab the unique sample names from the taxonomy table + samples_taxa = taxa_df['sample_id'].unique() + # for every sampleID in taxadf merge the results + for sampleID in samples_taxa: + # subset ampcombi + subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)] + # subset taxa + subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)] + # merge + subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left') + # cleanup the table + columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y'] + subset_df.drop(columnsremove, axis=1, inplace=True) + subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True) + # append in the combined_df + merged_df = merged_df.append(subset_df, ignore_index=True) + + # write to file + merged_df.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False) + +######################################### +# SUBPARSERS : DEFAULT +######################################### +ampcombi_parser.set_defaults(func=ampcombi_taxa) +combgc_parser.set_defaults(func=combgc_taxa) +hamronization_parser.set_defaults(func=hamronization_taxa) + +if __name__ == '__main__': + args = parser.parse_args() + args.func(args) # call the default function diff --git a/docs/output.md b/docs/output.md index c04ff5fc..b47b8cc9 100644 --- a/docs/output.md +++ b/docs/output.md @@ -8,7 +8,7 @@ The output of nf-core/funcscan provides reports for each of the functional group - antimicrobial peptides (tools: [Macrel](https://github.com/BigDataBiology/macrel), [AMPlify](https://github.com/bcgsc/AMPlify), [ampir](https://ampir.marine-omics.net), [hmmsearch](http://hmmer.org) – summarised by [AMPcombi](https://github.com/Darcy220606/AMPcombi)) - 
biosynthetic gene clusters (tools: [antiSMASH](https://docs.antismash.secondarymetabolites.org), [DeepBGC](https://github.com/Merck/deepbgc), [GECCO](https://gecco.embl.de), [hmmsearch](http://hmmer.org) – summarised by [comBGC](#combgc)) -As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. +As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set and taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) if the `taxonomy_mmseqs_save_intermedfiles` flag was set. Similarly, all downloaded databases are saved (i.e. 
from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/downloads/` if the `--save_databases` flag was set. @@ -19,7 +19,9 @@ The directories listed below will be created in the results directory (specified ```console results/ ├── taxonomy/ -| ├── mmseqs_createtsv/ +| ├── mmseqs_createdb/ +| ├── mmseqs_taxonomy/ +| └── mmseqs_createtsv/ ├── annotation/ | ├── bakta/ | ├── prodigal @@ -58,7 +60,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes p Taxonomy classification of contigs with: - - [MMseqs2](https://github.com/soedinglab/MMseqs2) (default) - for contig taxonomic classification using 2bLCA. +- [MMseqs2](https://github.com/soedinglab/MMseqs2) (default) - for contig taxonomic classification using 2bLCA. ORF prediction and annotation with any of: @@ -100,6 +102,7 @@ Output Summaries: ## Tool details ### Taxonomic classification tool + [MMseqs2](#MMseqs2)
@@ -107,9 +110,10 @@ Output Summaries: - `taxonomy/mmseqs2_createtsv/` - `/`: - - `*.tsv`: tab seperated table containing the taxonomic lineage of every contig when available - -> Descriptions taken from the [MMseqs2 documentation](https://github.com/soedinglab/MMseqs2/wiki) + - `*.tsv`: tab seperated table containing the taxonomic lineage of every contig when available. +- `reports//` +- `*_complete_summary_taxonomy.tsv`: tab seperated table containing the taxonomic lineage of every contig when available along with teh results from the summary tables. + > Descriptions taken from the [MMseqs2 documentation](https://github.com/soedinglab/MMseqs2/wiki)
diff --git a/modules/local/merge_taxonomy_ampcombi.nf b/modules/local/merge_taxonomy_ampcombi.nf new file mode 100644 index 00000000..6a7c0185 --- /dev/null +++ b/modules/local/merge_taxonomy_ampcombi.nf @@ -0,0 +1,32 @@ +process MERGE_TAXONOMY_AMPCOMBI { + label 'process_medium' + + conda "conda-forge::python=3.11.0 conda-forge::biopython=1.80 conda-forge::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' : + 'biocontainers/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' }" + + input: + path(ampcombi_df) + path(taxa_list) + + output: + path("ampcombi_complete_summary_taxonomy.tsv") , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in nf-core/funcscan/bin/ + """ + merge_taxonomy.py \\ + ampcombi_taxa \\ + --ampcombi $ampcombi_df \\ + --taxonomy $taxa_list + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + merge_taxonomy: \$(merge_taxonomy.py --version | sed 's/merge_taxonomy //g') + END_VERSIONS + """ +} diff --git a/modules/local/merge_taxonomy_combgc.nf b/modules/local/merge_taxonomy_combgc.nf new file mode 100644 index 00000000..263d6700 --- /dev/null +++ b/modules/local/merge_taxonomy_combgc.nf @@ -0,0 +1,32 @@ +process MERGE_TAXONOMY_COMBGC { + label 'process_medium' + + conda "conda-forge::python=3.11.0 conda-forge::biopython=1.80 conda-forge::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' : + 'biocontainers/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' }" + + input: + path(combgc_df) + path(taxa_list) + + output: + path("combgc_complete_summary_taxonomy.tsv") , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in nf-core/funcscan/bin/ + """ + merge_taxonomy.py \\ + combgc_taxa \\ + --combgc $combgc_df \\ + --taxonomy $taxa_list + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + merge_taxonomy: \$(merge_taxonomy.py --version | sed 's/merge_taxonomy //g') + END_VERSIONS + """ +} diff --git a/modules/local/merge_taxonomy_hamronization.nf b/modules/local/merge_taxonomy_hamronization.nf new file mode 100644 index 00000000..97d066ff --- /dev/null +++ b/modules/local/merge_taxonomy_hamronization.nf @@ -0,0 +1,32 @@ +process MERGE_TAXONOMY_HAMRONIZATION { + label 'process_medium' + + conda "conda-forge::python=3.11.0 conda-forge::biopython=1.80 conda-forge::pandas=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' : + 'biocontainers/mulled-v2-27978155697a3671f3ef9aead4b5c823a02cc0b7:548df772fe13c0232a7eab1bc1deb98b495a05ab-0' }" + + input: + path(hamronization_df) + path(taxa_list) + + output: + path("hamronization_complete_summary_taxonomy.tsv") , emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // This script is bundled with the pipeline, in nf-core/funcscan/bin/ + """ + merge_taxonomy.py \\ + hamronization_taxa \\ + --hamronization $hamronization_df \\ + --taxonomy $taxa_list + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + merge_taxonomy: \$(merge_taxonomy.py --version | sed 's/merge_taxonomy //g') + END_VERSIONS + """ +} diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index 1e641893..c70c472b 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -10,11 +10,13 @@ include { DRAMP_DOWNLOAD } from '../. 
include { AMPCOMBI } from '../../modules/nf-core/ampcombi/main' include { GUNZIP as GUNZIP_MACREL_PRED ; GUNZIP as GUNZIP_HMMER ; GUNZIP as GUNZIP_MACREL_ORFS } from '../../modules/nf-core/gunzip/main' include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' +include { MERGE_TAXONOMY_AMPCOMBI } from '../../modules/local/merge_taxonomy_ampcombi' workflow AMP { take: contigs // tuple val(meta), path(contigs) faa // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) + tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -101,17 +103,16 @@ workflow AMP { } AMPCOMBI( ch_input_for_ampcombi.input, ch_input_for_ampcombi.faa, ch_ampcombi_input_db ) - ch_ampcombi_summaries = ch_ampcombi_summaries.mix(AMPCOMBI.out.csv) //AMPCOMBI concatenation - ch_ampcombi_summaries_out = ch_ampcombi_summaries - .multiMap{ - input: [ it[0] ] - summary: it[1] - } + ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile(name: 'ampcombi_complete_summary.tsv', keepHeader:true) + + // MERGE_TAXONOMY + ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + MERGE_TAXONOMY_AMPCOMBI(ch_ampcombi_summaries, ch_mmseqs_taxonomy_list) - ch_tabix_input = Channel.of(['id':'ampcombi_complete_summary']) - .combine(ch_ampcombi_summaries_out.summary.collectFile(name: 'ampcombi_complete_summary.csv', keepHeader:true)) + ch_tabix_input = Channel.of(['id':'ampcombi_complete_summary_taxonomy']) + .combine(MERGE_TAXONOMY_AMPCOMBI.out.tsv) TABIX_BGZIP(ch_tabix_input) diff --git a/subworkflows/local/arg.nf b/subworkflows/local/arg.nf index 17945a7c..6b9ddf1f 100644 --- a/subworkflows/local/arg.nf +++ b/subworkflows/local/arg.nf @@ -2,24 +2,26 @@ Run ARG screening tools */ -include { ABRICATE_RUN } from '../../modules/nf-core/abricate/run/main' -include { AMRFINDERPLUS_UPDATE } from '../../modules/nf-core/amrfinderplus/update/main' -include { AMRFINDERPLUS_RUN } from '../../modules/nf-core/amrfinderplus/run/main' -include { FARGENE } from 
'../../modules/nf-core/fargene/main' -include { DEEPARG_DOWNLOADDATA } from '../../modules/nf-core/deeparg/downloaddata/main' -include { DEEPARG_PREDICT } from '../../modules/nf-core/deeparg/predict/main' -include { RGI_MAIN } from '../../modules/nf-core/rgi/main/main' -include { HAMRONIZATION_ABRICATE } from '../../modules/nf-core/hamronization/abricate/main' -include { HAMRONIZATION_RGI } from '../../modules/nf-core/hamronization/rgi/main' -include { HAMRONIZATION_DEEPARG } from '../../modules/nf-core/hamronization/deeparg/main' -include { HAMRONIZATION_AMRFINDERPLUS } from '../../modules/nf-core/hamronization/amrfinderplus/main' -include { HAMRONIZATION_FARGENE } from '../../modules/nf-core/hamronization/fargene/main' -include { HAMRONIZATION_SUMMARIZE } from '../../modules/nf-core/hamronization/summarize/main' +include { ABRICATE_RUN } from '../../modules/nf-core/abricate/run/main' +include { AMRFINDERPLUS_UPDATE } from '../../modules/nf-core/amrfinderplus/update/main' +include { AMRFINDERPLUS_RUN } from '../../modules/nf-core/amrfinderplus/run/main' +include { FARGENE } from '../../modules/nf-core/fargene/main' +include { DEEPARG_DOWNLOADDATA } from '../../modules/nf-core/deeparg/downloaddata/main' +include { DEEPARG_PREDICT } from '../../modules/nf-core/deeparg/predict/main' +include { RGI_MAIN } from '../../modules/nf-core/rgi/main/main' +include { HAMRONIZATION_ABRICATE } from '../../modules/nf-core/hamronization/abricate/main' +include { HAMRONIZATION_RGI } from '../../modules/nf-core/hamronization/rgi/main' +include { HAMRONIZATION_DEEPARG } from '../../modules/nf-core/hamronization/deeparg/main' +include { HAMRONIZATION_AMRFINDERPLUS } from '../../modules/nf-core/hamronization/amrfinderplus/main' +include { HAMRONIZATION_FARGENE } from '../../modules/nf-core/hamronization/fargene/main' +include { HAMRONIZATION_SUMMARIZE } from '../../modules/nf-core/hamronization/summarize/main' +include { MERGE_TAXONOMY_HAMRONIZATION } from 
'../../modules/local/merge_taxonomy_hamronization' workflow ARG { take: contigs // tuple val(meta), path(contigs) annotations // output from prokka + tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -145,6 +147,10 @@ workflow ARG { HAMRONIZATION_SUMMARIZE( ch_input_for_hamronization_summarize, params.arg_hamronization_summarizeformat ) ch_versions = ch_versions.mix(HAMRONIZATION_SUMMARIZE.out.versions) + // MERGE_TAXONOMY + ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + MERGE_TAXONOMY_HAMRONIZATION(HAMRONIZATION_SUMMARIZE.out.tsv, ch_mmseqs_taxonomy_list) + emit: versions = ch_versions } diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 22074d16..5d5fbd9e 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -12,6 +12,7 @@ include { HMMER_HMMSEARCH as BGC_HMMER_HMMSEARCH } from '../../modules/nf-core include { DEEPBGC_DOWNLOAD } from '../../modules/nf-core/deepbgc/download/main' include { DEEPBGC_PIPELINE } from '../../modules/nf-core/deepbgc/pipeline/main' include { COMBGC } from '../../modules/local/combgc' +include { MERGE_TAXONOMY_COMBGC } from '../../modules/local/merge_taxonomy_combgc' workflow BGC { @@ -20,6 +21,7 @@ workflow BGC { gff // tuple val(meta), path(.out.gff) faa // tuple val(meta), path(.out.faa) gbk // tuple val(meta), path(.out.gbk) + tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -183,8 +185,13 @@ workflow BGC { // COMBGC COMBGC ( ch_bgcresults_for_combgc ) + // COMBGC concatenation ch_combgc_summaries = COMBGC.out.tsv.map{ it[1] }.collectFile(name: 'combgc_complete_summary.tsv', storeDir: "${params.outdir}/reports/combgc", keepHeader:true) + // MERGE_TAXONOMY + ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + MERGE_TAXONOMY_COMBGC(ch_combgc_summaries, ch_mmseqs_taxonomy_list) + emit: versions = ch_versions } diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 1e27d69b..4a499cbc 
100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -267,6 +267,12 @@ workflow FUNCSCAN { meta, file -> if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}") !file.isEmpty() + }, + ch_taxonomy_tsv + .filter { + meta, file -> + if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}") + !file.isEmpty() } ) ch_versions = ch_versions.mix(AMP.out.versions) @@ -277,7 +283,15 @@ workflow FUNCSCAN { */ if ( params.run_arg_screening ) { if (params.arg_skip_deeparg) { - ARG ( ch_prepped_input, [] ) + ARG ( ch_prepped_input, + [], + ch_taxonomy_tsv + .filter { + meta, file -> + if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}") + !file.isEmpty() + } + ) } else { ARG ( ch_prepped_input, @@ -286,7 +300,13 @@ workflow FUNCSCAN { meta, file -> if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}") !file.isEmpty() - } + }, + ch_taxonomy_tsv + .filter { + meta, file -> + if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}") + !file.isEmpty() + } ) } ch_versions = ch_versions.mix(ARG.out.versions) @@ -315,6 +335,12 @@ workflow FUNCSCAN { meta, file -> if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}") !file.isEmpty() + }, + ch_taxonomy_tsv + .filter { + meta, file -> + if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. 
Taxonomy merging will not be executed: ${meta.id}") + !file.isEmpty() } ) ch_versions = ch_versions.mix(BGC.out.versions) From 71e0151548349a00dcc7dbbfb9c6516a0ce9c5b5 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 12 Mar 2024 23:17:30 +0100 Subject: [PATCH 09/39] add nf-tests for the nf-core modules --- .gitignore | 1 - conf/modules.config | 25 ++ docs/output.md | 12 +- modules.json | 2 +- .../mmseqs/createdb/tests/main.nf.test | 58 ++++ .../mmseqs/createdb/tests/main.nf.test.snap | 47 ++++ .../nf-core/mmseqs/createdb/tests/tags.yml | 2 + modules/nf-core/mmseqs/createtsv/main.nf | 8 +- .../createtsv/tests/cluster.nextflow.config | 6 + .../mmseqs/createtsv/tests/main.nf.test | 248 ++++++++++++++++++ .../mmseqs/createtsv/tests/main.nf.test.snap | 137 ++++++++++ .../nf-core/mmseqs/createtsv/tests/tags.yml | 2 + .../createtsv/tests/taxonomy.nextflow.config | 7 + .../mmseqs/taxonomy/tests/main.nf.test | 81 ++++++ .../mmseqs/taxonomy/tests/main.nf.test.snap | 44 ++++ .../mmseqs/taxonomy/tests/nextflow.config | 5 + .../nf-core/mmseqs/taxonomy/tests/tags.yml | 2 + 17 files changed, 675 insertions(+), 12 deletions(-) create mode 100644 modules/nf-core/mmseqs/createdb/tests/main.nf.test create mode 100644 modules/nf-core/mmseqs/createdb/tests/main.nf.test.snap create mode 100644 modules/nf-core/mmseqs/createdb/tests/tags.yml create mode 100644 modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config create mode 100644 modules/nf-core/mmseqs/createtsv/tests/main.nf.test create mode 100644 modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap create mode 100644 modules/nf-core/mmseqs/createtsv/tests/tags.yml create mode 100644 modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config create mode 100644 modules/nf-core/mmseqs/taxonomy/tests/main.nf.test create mode 100644 modules/nf-core/mmseqs/taxonomy/tests/main.nf.test.snap create mode 100644 modules/nf-core/mmseqs/taxonomy/tests/nextflow.config create mode 100644 
modules/nf-core/mmseqs/taxonomy/tests/tags.yml diff --git a/.gitignore b/.gitignore index ced46a87..5124c9ac 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,3 @@ results/ testing/ testing* *.pyc -tests/ diff --git a/conf/modules.config b/conf/modules.config index 3cb9826f..6a2638f6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -96,6 +96,7 @@ process { publishDir = [ path: { "${params.outdir}/taxonomy/mmseqs_createtsv/${meta.id}" }, mode: params.publish_dir_mode, + enabled: params.taxonomy_mmseqs_save_intermedfiles, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } @@ -510,6 +511,14 @@ process { ] } + withName: MERGE_TAXONOMY_HAMRONIZATION { + publishDir = [ + path: { "${params.outdir}/reports/hamronization_summarize" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: AMPCOMBI { publishDir = [ path: { "${params.outdir}/reports/ampcombi" }, @@ -521,6 +530,14 @@ process { ext.args = "--tooldict '${ext.tooldict}' --cutoff ${params.amp_ampcombi_cutoff}" } + withName: MERGE_TAXONOMY_AMPCOMBI { + publishDir = [ + path: { "${params.outdir}/reports/ampcombi" }, + mode: params.taxonomy_mmseqs_save_intermedfiles, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: TABIX_BGZIP { publishDir = [ path: { "${params.outdir}/reports/ampcombi" }, @@ -537,6 +554,14 @@ process { ] } + withName: MERGE_TAXONOMY_COMBGC { + publishDir = [ + path: { "${params.outdir}/reports/combgc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: DRAMP_DOWNLOAD { publishDir = [ path: { "${params.outdir}/databases/dramp" }, diff --git a/docs/output.md b/docs/output.md index b47b8cc9..e8fa59fe 100644 --- a/docs/output.md +++ b/docs/output.md @@ -111,8 +111,7 @@ Output Summaries: - `taxonomy/mmseqs2_createtsv/` - `/`: - `*.tsv`: tab seperated table containing the taxonomic lineage of every contig when available. -- `reports//` -- `*_complete_summary_taxonomy.tsv`: tab seperated table containing the taxonomic lineage of every contig when available along with teh results from the summary tables. +- `reports//_complete_summary_taxonomy.tsv`: tab seperated table containing the taxonomic lineage of every contig when available along with teh results from the summary tables. > Descriptions taken from the [MMseqs2 documentation](https://github.com/soedinglab/MMseqs2/wiki) @@ -429,11 +428,12 @@ Output Summaries: Output files - `ampcombi/` - - `ampcombi_complete_summary.csv.gz`: summarised output from all AMP workflow tools (except hmmer_hmmsearch) in compressed csv format + - `ampcombi_complete_summary_taxonomy.tsv`: summarised output from all AMP workflow tools with taxonomic assignment in tsv format + - `ampcombi_complete_summary_taxonomy.tsv.gz`: summarised output from all AMP workflow tools with taxonomic assignment in compressed tsv format - `ampcombi.log`: a log file generated by ampcombi - - `*_ampcombi.csv`: summarised output in csv for each sample - - `*_amp.faa*`: fasta file containing the amino acid sequences for all AMP hits for each sample - - `*_diamond_matches.txt*`: alignment file generated by DIAMOND for each sample + - `/*_ampcombi.csv`: summarised output in csv for each sample + - `/*_amp.faa*`: fasta file containing the amino acid sequences for all AMP hits for each sample + - `/*_diamond_matches.txt*`: alignment file generated by DIAMOND for each sample diff --git a/modules.json b/modules.json index fc89ae2c..fb5f05a7 100644 --- a/modules.json +++ 
b/modules.json @@ -148,7 +148,7 @@ }, "mmseqs/createtsv": { "branch": "master", - "git_sha": "151460db852d636979d9ff3ee631e2268060d4c3", + "git_sha": "5d849d54f06174c3313eb50c776d4916912db16b", "installed_by": ["modules"] }, "mmseqs/databases": { diff --git a/modules/nf-core/mmseqs/createdb/tests/main.nf.test b/modules/nf-core/mmseqs/createdb/tests/main.nf.test new file mode 100644 index 00000000..60d73419 --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/tests/main.nf.test @@ -0,0 +1,58 @@ +nextflow_process { + + name "Test Process MMSEQS_CREATEDB" + script "../main.nf" + process "MMSEQS_CREATEDB" + tag "modules" + tag "modules_nfcore" + tag "mmseqs" + tag "mmseqs/createdb" + + test("Should build an mmseqs db from a contigs fasta file") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['contigs_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.db).match("contig_db") }, + { assert process.out.versions } + ) + } + + } + + test("Should build an mmseqs db from a zipped amino acid sequence file") { + + when { + process { + """ + + input[0] = [ + [ id:'test' ], + file(params.test_data['sarscov2']['genome']['proteome_fasta_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.db).match("protein_gz_db") }, + { assert process.out.versions } + ) + } + + } + +} diff --git a/modules/nf-core/mmseqs/createdb/tests/main.nf.test.snap b/modules/nf-core/mmseqs/createdb/tests/main.nf.test.snap new file mode 100644 index 00000000..0c600c1f --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/tests/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "protein_gz_db": { + "content": [ + [ + [ + { + "id": "test" + }, + [ + "test:md5,4b494965ed7ab67da8ca3f39523eb104", + "test.dbtype:md5,152afd7bf4dbe26f85032eee0269201a", + 
"test.index:md5,46f9d884e9a7f442fe1cd2ce339734e3", + "test.lookup:md5,3e27cb93d9ee875ad42a6f32f5651bdc", + "test.source:md5,eaa64fc8a5f7ec1ee49b0dcbd1a72e9d", + "test_h:md5,6e798b81c70d191f78939c2dd6223a7f", + "test_h.dbtype:md5,8895d3d8e9322aedbf45249dfb3ddb0a", + "test_h.index:md5,d5ac49ff56df064b980fa0eb5da57673" + ] + ] + ] + ], + "timestamp": "2023-11-21T12:10:12.018974702" + }, + "contig_db": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test:md5,7c3c2c5926cf8fa82e66b9628f680256", + "test.dbtype:md5,c8ed20c23ba91f4577f84c940c86c7db", + "test.index:md5,5b2fd8abd0ad3fee24738af7082e6a6e", + "test.lookup:md5,32f88756dbcb6aaf7b239b0d61730f1b", + "test.source:md5,9ada5b3ea6e1a7e16c4418eb98ae8d9d", + "test_h:md5,8c29f5ed94d83d7115e9c8a883ce358d", + "test_h.dbtype:md5,8895d3d8e9322aedbf45249dfb3ddb0a", + "test_h.index:md5,87c7c8c6d16018ebfaa6f408391a5ae2" + ] + ] + ] + ], + "timestamp": "2023-11-21T12:10:04.7348329" + } +} \ No newline at end of file diff --git a/modules/nf-core/mmseqs/createdb/tests/tags.yml b/modules/nf-core/mmseqs/createdb/tests/tags.yml new file mode 100644 index 00000000..1f511ab0 --- /dev/null +++ b/modules/nf-core/mmseqs/createdb/tests/tags.yml @@ -0,0 +1,2 @@ +mmseqs/createdb: + - modules/nf-core/mmseqs/createdb/** diff --git a/modules/nf-core/mmseqs/createtsv/main.nf b/modules/nf-core/mmseqs/createtsv/main.nf index ee58b10f..dcd4c13d 100644 --- a/modules/nf-core/mmseqs/createtsv/main.nf +++ b/modules/nf-core/mmseqs/createtsv/main.nf @@ -22,11 +22,11 @@ process MMSEQS_CREATETSV { script: def args = task.ext.args ?: '' - def args2 = task.ext.args ?: "*.dbtype" - def args3 = task.ext.args ?: "*.dbtype" - def args4 = task.ext.args ?: "*.dbtype" + def args2 = task.ext.args ?: "*.dbtype" // database generated by mmyseqs cluster | search | taxonomy | ... 
+ def args3 = task.ext.args ?: "*.dbtype" // database generated by mmyseqs/createdb + def args4 = task.ext.args ?: "*.dbtype" // database generated by mmyseqs/createdb def prefix = task.ext.prefix ?: "${meta.id}" - db_target = db_target ?: "${db_query}" // optional argument db_target as in many cases, it's the same as db_query + """ # Extract files with specified args based suffix | remove suffix | isolate longest common substring of files DB_RESULT_PATH_NAME=\$(find -L "$db_result/" -maxdepth 1 -name "$args2" | sed 's/\\.[^.]*\$//' | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' ) diff --git a/modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config b/modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config new file mode 100644 index 00000000..48fee164 --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/tests/cluster.nextflow.config @@ -0,0 +1,6 @@ +process { + + withName: MMSEQS_CREATETSV { + ext.args2 = '*_clu.dbtype' + } +} diff --git a/modules/nf-core/mmseqs/createtsv/tests/main.nf.test b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test new file mode 100644 index 00000000..99e79e0c --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test @@ -0,0 +1,248 @@ +nextflow_process { + + name "Test Process MMSEQS_CREATETSV" + script "../main.nf" + process "MMSEQS_CREATETSV" + + tag "modules" + tag "modules_nfcore" + tag "mmseqs" + tag "mmseqs/taxonomy" + tag "mmseqs/createdb" + tag "mmseqs/databases" + tag "untar" + tag "mmseqs/createtsv" + + test("mmseqs/createtsv - bacteroides_fragilis - taxonomy") { + + config "./taxonomy.nextflow.config" + + setup { + run("MMSEQS_CREATEDB", alias: "MMSEQS_TAXA") { + script "../../createdb/main.nf" + process { + """ + input[0] = [ + [ id:'test_query', single_end:false ], + file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) + ] + """ + } + } + run("MMSEQS_DATABASES") { + script "../../databases/main.nf" + process { + """ + input[0] = 'SILVA' + """ + } + } + 
run("MMSEQS_TAXONOMY") { + script "../../taxonomy/main.nf" + process { + """ + input[0] = MMSEQS_TAXA.out.db + input[1] = MMSEQS_DATABASES.out.database + """ + } + } + } + when { + process { + """ + input[0] = MMSEQS_TAXONOMY.out.db_taxonomy + input[1] = [[:],[]] + input[2] = MMSEQS_TAXA.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("mmseqs/createtsv - sarscov2 - cluster") { + + config "./cluster.nextflow.config" + + setup { + run("UNTAR", alias: "UNTAR_QUERY") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_query', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + run("UNTAR", alias: "UNTAR_TARGET") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_target', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + run("UNTAR", alias: "UNTAR_RESULT") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_result', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + } + + when { + + process { + """ + ch_query = UNTAR_QUERY.out.untar + ch_target = UNTAR_TARGET.out.untar + ch_result = UNTAR_RESULT.out.untar + + input[0] = ch_result + input[1] = ch_query + input[2] = ch_target + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.tsv).match("tsv") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("mmseqs/createtsv - bacteroides_fragilis - taxonomy - stub") { + + options "-stub" + config "./taxonomy.nextflow.config" + + setup { + run("MMSEQS_CREATEDB", alias: "MMSEQS_TAXA") { + script "../../createdb/main.nf" + process { + """ + input[0] = [ + [ id:'test_query', single_end:false ], + 
file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) + ] + """ + } + } + run("MMSEQS_DATABASES") { + script "../../databases/main.nf" + process { + """ + input[0] = 'SILVA' + """ + } + } + run("MMSEQS_TAXONOMY") { + script "../../taxonomy/main.nf" + process { + """ + input[0] = MMSEQS_TAXA.out.db + input[1] = MMSEQS_DATABASES.out.database + """ + } + } + } + when { + process { + """ + input[0] = MMSEQS_TAXONOMY.out.db_taxonomy + input[1] = [[:],[]] + input[2] = MMSEQS_TAXA.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("mmseqs/createtsv - sarscov2 - cluster - stub") { + + options "-stub" + config "./cluster.nextflow.config" + + setup { + run("UNTAR", alias: "UNTAR_QUERY") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_query', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + run("UNTAR", alias: "UNTAR_TARGET") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_target', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + run("UNTAR", alias: "UNTAR_RESULT") { + script "../../../untar/main.nf" + process { + """ + input[0] = [ + [ id:'test_result', single_end:true ], + file(params.test_data['sarscov2']['genome']['mmseqs_tar_gz'], checkIfExists: true), + ] + """ + } + } + } + + when { + + process { + """ + ch_query = UNTAR_QUERY.out.untar + ch_target = UNTAR_TARGET.out.untar + ch_result = UNTAR_RESULT.out.untar + + input[0] = ch_result + input[1] = ch_query + input[2] = ch_target + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap 
b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap new file mode 100644 index 00000000..5b8f9569 --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/tests/main.nf.test.snap @@ -0,0 +1,137 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-12T11:53:02.392516336" + }, + "tsv": { + "content": [ + [ + [ + { + "id": "test_result", + "single_end": true + }, + "test_result.tsv:md5,4e7ba50ce2879660dc6595286bf0d097" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-12T11:53:02.311022721" + }, + "mmseqs/createtsv - bacteroides_fragilis - taxonomy - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_query", + "single_end": false + }, + "test_query.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ], + "tsv": [ + [ + { + "id": "test_query", + "single_end": false + }, + "test_query.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-12T11:53:11.715695614" + }, + "mmseqs/createtsv - sarscov2 - cluster - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_result", + "single_end": true + }, + "test_result.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ], + "tsv": [ + [ + { + "id": "test_result", + "single_end": true + }, + "test_result.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-12T11:53:21.386186911" + }, + "mmseqs/createtsv - bacteroides_fragilis - taxonomy": { + "content": [ + { + "0": [ + [ + { 
+ "id": "test_query", + "single_end": false + }, + "test_query.tsv:md5,9179f5c85b8b87a4dc998c9d17840161" + ] + ], + "1": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ], + "tsv": [ + [ + { + "id": "test_query", + "single_end": false + }, + "test_query.tsv:md5,9179f5c85b8b87a4dc998c9d17840161" + ] + ], + "versions": [ + "versions.yml:md5,20a853f50c920d431e5ab7593ca79e6f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-12T11:52:51.792345007" + } +} diff --git a/modules/nf-core/mmseqs/createtsv/tests/tags.yml b/modules/nf-core/mmseqs/createtsv/tests/tags.yml new file mode 100644 index 00000000..e27827f5 --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/tests/tags.yml @@ -0,0 +1,2 @@ +mmseqs/createtsv: + - "modules/nf-core/mmseqs/createtsv/**" diff --git a/modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config b/modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config new file mode 100644 index 00000000..f08205d1 --- /dev/null +++ b/modules/nf-core/mmseqs/createtsv/tests/taxonomy.nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: MMSEQS_TAXONOMY { + ext.args = '--search-type 2' + } + +} diff --git a/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test b/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test new file mode 100644 index 00000000..90b356ae --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test @@ -0,0 +1,81 @@ +nextflow_process { + + name "Test Process MMSEQS_TAXONOMY" + script "../main.nf" + config "./nextflow.config" + process "MMSEQS_TAXONOMY" + + tag "modules" + tag "modules_nfcore" + tag "mmseqs" + tag "mmseqs/taxonomy" + tag "mmseqs/createdb" + tag "mmseqs/databases" + + setup { + run("MMSEQS_CREATEDB") { + script "modules/nf-core/mmseqs/createdb/main.nf" + process { + """ + input[0] = [ + [ id:'test_query', single_end:false ], + file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true) + ] + """ + } + } + + 
run("MMSEQS_DATABASES") { + script "modules/nf-core/mmseqs/databases/main.nf" + process { + """ + input[0] = 'SILVA' + """ + } + } + } + + test("mmseqs/taxonomy - bacteroides_fragilis - genome_nt") { + when { + process { + """ + input[0] = MMSEQS_CREATEDB.out.db + input[1] = MMSEQS_DATABASES.out.database + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.db_taxonomy.get(0).get(1)).list().sort() ).match()}, + { assert process.out.versions } + ) + } + } + + test("mmseqs/taxonomy - bacteroides_fragilis - genome_nt - stub") { + + options "-stub" + + when { + process { + """ + input[0] = MMSEQS_CREATEDB.out.db + input[1] = MMSEQS_DATABASES.out.database + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.db_taxonomy.get(0).get(1)).list().sort() ).match()}, + { assert process.out.versions } + ) + } + } +} + diff --git a/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test.snap b/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test.snap new file mode 100644 index 00000000..d1b57c05 --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/tests/main.nf.test.snap @@ -0,0 +1,44 @@ +{ + "mmseqs/taxonomy - bacteroides_fragilis - genome_nt": { + "content": [ + "test_query.0", + "test_query.1", + "test_query.dbtype", + "test_query.index" + ], + "timestamp": "2024-02-26T16:35:10.953102408" + }, + "mmseqs/taxonomy - bacteroides_fragilis - genome_nt - stub": { + "content": [ + "test_query.0", + "test_query.1", + "test_query.10", + "test_query.11", + "test_query.12", + "test_query.13", + "test_query.14", + "test_query.15", + "test_query.16", + "test_query.17", + "test_query.18", + "test_query.19", + "test_query.2", + "test_query.20", + "test_query.21", + "test_query.22", + "test_query.23", + "test_query.24", + "test_query.25", + "test_query.3", + "test_query.4", + "test_query.5", + "test_query.6", + "test_query.7", + "test_query.8", + "test_query.9", + "test_query.dbtype", + 
"test_query.index" + ], + "timestamp": "2024-02-26T16:35:20.111282029" + } +} \ No newline at end of file diff --git a/modules/nf-core/mmseqs/taxonomy/tests/nextflow.config b/modules/nf-core/mmseqs/taxonomy/tests/nextflow.config new file mode 100644 index 00000000..72f6fc81 --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: MMSEQS_TAXONOMY { + ext.args = '--search-type 2' + } +} diff --git a/modules/nf-core/mmseqs/taxonomy/tests/tags.yml b/modules/nf-core/mmseqs/taxonomy/tests/tags.yml new file mode 100644 index 00000000..76172197 --- /dev/null +++ b/modules/nf-core/mmseqs/taxonomy/tests/tags.yml @@ -0,0 +1,2 @@ +mmseqs/taxonomy: + - "modules/nf-core/mmseqs/taxonomy/**" From b7623aeb22b9168099b4b8e7a802b072833a59b5 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 12 Mar 2024 23:48:58 +0100 Subject: [PATCH 10/39] prettier --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f591f55d..3119ca3b 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ On release, automated continuous integration tests run the pipeline on a full-si The nf-core/funcscan AWS full test dataset are contigs generated by the MGnify service from the ENA. We used contigs generated from assemblies of chicken cecum shotgun metagenomes (study accession: MGYS00005631). ## Pipeline summary + 1. Taxonomic classification of contigs from **of prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2). 2. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta) 3. 
Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify) From 035ea16183585486057a57db435d4baecb5cfde0 Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Tue, 12 Mar 2024 23:52:38 +0100 Subject: [PATCH 11/39] Update nextflow_schema.json Co-authored-by: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 0aedd283..7b16f43a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -372,7 +372,7 @@ "default": "Bacteria", "fa_icon": "fab fa-accusoft", "description": "Specify the kingdom that the input represents.", - "help_text": "Specifies the kingdom that the input sample is derived from and/or you wish to screen for\n\n> \u26a0\ufe0f Prokka cannot annotate Eukaryotes.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--kingdom`", + "help_text": "Specifies the kingdom that the input sample is derived from and/or you wish to screen for\n\n> ⚠️ Prokka cannot annotate Eukaryotes.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--kingdom`", "enum": ["Archaea", "Bacteria", "Mitochondria", "Viruses"] }, "annotation_prokka_gcode": { From 641452a6e3bba1abc33e995124a44f6d6629c554 Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Tue, 12 Mar 2024 23:53:50 +0100 Subject: [PATCH 12/39] Update nextflow_schema.json Co-authored-by: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> --- nextflow_schema.json | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 7b16f43a..e6d0662a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -641,7 +641,7 @@ }, "amp_ampcombi_cutoff": { "type": "number", - "default": 0, + "default": 0.0, "description": "Specify probability cutoff to filter AMPs", "help_text": "Specify the minimum probability an AMP hit must have to be retained in the final output file. Anything below this threshold will be removed.\n\nFor more information check AMPcombi [documentation](https://github.com/Darcy220606/AMPcombi).\n\n> Modifies tool parameter(s):\n> - AMPCOMBI: `--cutoff`", "fa_icon": "fas fa-sort-amount-up" From bf8536d99cfc7cc46ac35dc025dc042a34f6fe47 Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Tue, 12 Mar 2024 23:54:02 +0100 Subject: [PATCH 13/39] Update nextflow_schema.json Co-authored-by: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index e6d0662a..b97af24b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -670,7 +670,7 @@ }, "arg_amrfinderplus_identmin": { "type": "number", - "default": -1, + "default": -1.0, "help_text": "Specify the minimum percentage amino-acid identity to reference protein or nucleotide identity for nucleotide reference must have if a BLAST alignment (based on methods: BLAST or PARTIAL) was detected, otherwise NA.\n\n If you specify `-1`, this means use a curated threshold if it exists and `0.9` otherwise.\n\nSetting this value to something other than `-1` will override any curated similarity cutoffs. For BLAST: alignment is > 90% of length and > 90% identity to a protein in the AMRFinderPlus database. 
For PARTIAL: alignment is > 50% of length, but < 90% of length and > 90% identity to the reference, and does not end at a contig boundary.\n\nFor more information check AMRFinderPlus [documentation](https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#--organism-option).\n\n> Modifies tool parameter(s):\n> - AMRFinderPlus: `--ident_min`", "description": "Minimum percent identity to reference sequence.", "fa_icon": "fas fa-angle-left" From 940c31a057b825093502a27a0b1e370d88fb8cf0 Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Tue, 12 Mar 2024 23:54:27 +0100 Subject: [PATCH 14/39] Update nextflow_schema.json Co-authored-by: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index b97af24b..f1ad7bfa 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -978,7 +978,7 @@ "default": 1000, "description": "Minimum longest-contig length a sample must have to be screened with antiSMASH.", "fa_icon": "fas fa-ruler-horizontal", - "help_text": "This specifies the minimum length that the longest contig must have for the entire sample to be screened by antiSMASH.\n\nAny samples that do not reach this length will be not be sent to antiSMASH, therefore you will not receive output for these samples in your `--outdir`.\n\n> \u26a0\ufe0f This is not the same as `--bgc_antismash_contigminlength`, which specifies to only analyse contigs above that threshold but _within_ a sample that has already passed `--bgc_antismash_sampleminlength` sample filter!" 
+ "help_text": "This specifies the minimum length that the longest contig must have for the entire sample to be screened by antiSMASH.\n\nAny samples that do not reach this length will be not be sent to antiSMASH, therefore you will not receive output for these samples in your `--outdir`.\n\n> ⚠️ This is not the same as `--bgc_antismash_contigminlength`, which specifies to only analyse contigs above that threshold but _within_ a sample that has already passed `--bgc_antismash_sampleminlength` sample filter!" }, "bgc_antismash_contigminlength": { "type": "integer", From 4f3fe990c284d2dcbfb3943d4797f87fdad54995 Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Wed, 13 Mar 2024 10:41:13 +0100 Subject: [PATCH 15/39] Update nextflow_schema.json Co-authored-by: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index f1ad7bfa..e6b560cc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -393,7 +393,7 @@ }, "annotation_prokka_evalue": { "type": "number", - "default": 1e-6, + "default": 0.000001, "description": "Minimum e-value cut-off.", "help_text": "Specifiy the minimum e-value used for filtering the alignment hits.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--evalue`", "fa_icon": "fas fa-sort-amount-down" From 5765fe1008d4ba1dae9243572bf853790ee131a6 Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Wed, 13 Mar 2024 10:41:25 +0100 Subject: [PATCH 16/39] Update nextflow_schema.json Co-authored-by: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index e6b560cc..43a7565b 
100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1146,7 +1146,7 @@ "type": "number", "description": "The p-value cutoff for protein domains to be included.", "fa_icon": "fas fa-filter", - "default": 1e-9, + "default": 0.000000001, "help_text": "The p-value cutoff for protein domains to be included.\n\nFor more information see the GECCO [documentation](https://github.com/zellerlab/GECCO).\n\n> Modifies tool parameter(s):\n> - GECCO: `--pfilter`" }, "bgc_gecco_threshold": { From 34f7bd23e0304d421dcec830d1d471b6b745e967 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Thu, 14 Mar 2024 13:58:47 +0100 Subject: [PATCH 17/39] add versions in subworkflows --- subworkflows/local/amp.nf | 1 + subworkflows/local/arg.nf | 1 + subworkflows/local/bgc.nf | 1 + 3 files changed, 3 insertions(+) diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index 19bd23ef..844826a4 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -113,6 +113,7 @@ workflow AMP { // MERGE_TAXONOMY ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() MERGE_TAXONOMY_AMPCOMBI(ch_ampcombi_summaries, ch_mmseqs_taxonomy_list) + ch_versions = ch_versions.mix(MERGE_TAXONOMY_AMPCOMBI.out.versions) ch_tabix_input = Channel.of(['id':'ampcombi_complete_summary_taxonomy']) .combine(MERGE_TAXONOMY_AMPCOMBI.out.tsv) diff --git a/subworkflows/local/arg.nf b/subworkflows/local/arg.nf index 6b9ddf1f..30bfee64 100644 --- a/subworkflows/local/arg.nf +++ b/subworkflows/local/arg.nf @@ -150,6 +150,7 @@ workflow ARG { // MERGE_TAXONOMY ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() MERGE_TAXONOMY_HAMRONIZATION(HAMRONIZATION_SUMMARIZE.out.tsv, ch_mmseqs_taxonomy_list) + ch_versions = ch_versions.mix(MERGE_TAXONOMY_HAMRONIZATION.out.versions) emit: versions = ch_versions diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index d6421b7f..c0d9a164 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -192,6 +192,7 @@ workflow BGC { // 
MERGE_TAXONOMY ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() MERGE_TAXONOMY_COMBGC(ch_combgc_summaries, ch_mmseqs_taxonomy_list) + ch_versions = ch_versions.mix(MERGE_TAXONOMY_COMBGC.out.versions) emit: versions = ch_versions From 2d1f1359b1f73bde04bf9d1ff702f2f4050c0d5b Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Fri, 15 Mar 2024 10:24:22 +0100 Subject: [PATCH 18/39] update nextflow config latest dev --- nextflow.config | 5 ----- 1 file changed, 5 deletions(-) diff --git a/nextflow.config b/nextflow.config index 75d60950..7068daa1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,11 +12,6 @@ params { // Input options input = null - // References - Not used in funcscan, left for template purposes - genome = null - igenomes_base = 's3://ngi-igenomes/igenomes/' - igenomes_ignore = false - // Taxonomy classification options taxonomy_mmseqs_classification_off = false taxonomy_mmseqs_save_intermedfiles = false From 1e7a0f0b650edf603f122c32324f7d61fe116172 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Fri, 15 Mar 2024 10:33:11 +0100 Subject: [PATCH 19/39] fix linting --- workflows/funcscan.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index a7ee4ccc..6b0cf853 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -252,8 +252,8 @@ workflow FUNCSCAN { if ( params.run_arg_screening ) { if (params.arg_skip_deeparg) { ARG ( ch_prepped_input, - [], - ch_taxonomy_tsv + [], + ch_taxonomy_tsv .filter { meta, file -> if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. 
Taxonomy merging will not be executed: ${meta.id}") From 3f5622f1b62c04c321fdb71572df17cfd618b4be Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 19 Mar 2024 11:56:05 +0100 Subject: [PATCH 20/39] changelo --- CHANGELOG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 17ef59fb..6893d0b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,8 +15,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` -- [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with 'sample_id\tcontig_id\t..'. Reformated the the output of 'hamronization summarize' module. (by @darcy220606) -- [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. Removed `tests/` from `.gitignore` (by @darcy220606) +- [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with +'sample_id\tcontig_id\t..'. Reformated the the output of 'hamronization summarize' module. (by @darcy220606) +- [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with +smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. 
Removed `tests/` from `.gitignore` (by +@darcy220606) ### `Dependencies` From 2a21d96bcfddf07f45af8b7d7f60234fa99b7397 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 19 Mar 2024 12:06:07 +0100 Subject: [PATCH 21/39] update CHNAGELOG --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6893d0b1..2ee2cd7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,10 +16,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` - [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with -'sample_id\tcontig_id\t..'. Reformated the the output of 'hamronization summarize' module. (by @darcy220606) + 'sample_id\tcontig_id\t..'. Reformated the the output of 'hamronization summarize' module. (by @darcy220606) - [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with -smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. Removed `tests/` from `.gitignore` (by -@darcy220606) + smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. Removed `tests/` from `.gitignore` (by + @darcy220606) ### `Dependencies` From 4446bdbcea5d60f673e122c6b3a1e33c717fcad4 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 19 Mar 2024 13:22:53 +0100 Subject: [PATCH 22/39] skip marfinderplus and deeparg --- CHANGELOG.md | 6 ++---- conf/test.config | 4 ++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2ee2cd7f..98cbf145 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,10 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` -- [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with - 'sample_id\tcontig_id\t..'. 
Reformated the the output of 'hamronization summarize' module. (by @darcy220606) -- [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with - smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. Removed `tests/` from `.gitignore` (by +- [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with 'sample_id\tcontig_id\t..'. Reformated the output of 'hamronization summarize' module. (by @darcy220606) +- [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. Removed `tests/` from `.gitignore` (by @darcy220606) ### `Dependencies` diff --git a/conf/test.config b/conf/test.config index 79fd38be..9a2cac05 100644 --- a/conf/test.config +++ b/conf/test.config @@ -27,6 +27,10 @@ params { run_arg_screening = true arg_fargene_hmmmodel = 'class_a,class_b_1_2' + arg_skip_amrfinderplus = true + arg_skip_deeparg = true run_amp_screening = true + amp_skip_amplify = true + amp_ampcombi_cutoff = 0.6 } From 798c457d09d6aad728afe03519d904b4f96a55ec Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 19 Mar 2024 13:50:36 +0100 Subject: [PATCH 23/39] try SILVA --- conf/test.config | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/conf/test.config b/conf/test.config index 9a2cac05..316c660e 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,12 +25,12 @@ params { annotation_tool = 'prodigal' - run_arg_screening = true - arg_fargene_hmmmodel = 'class_a,class_b_1_2' - arg_skip_amrfinderplus = true - arg_skip_deeparg = true - - run_amp_screening = true - amp_skip_amplify = true - amp_ampcombi_cutoff = 0.6 + run_arg_screening = true + arg_fargene_hmmmodel = 'class_a,class_b_1_2' + arg_skip_amrfinderplus = true + 
arg_skip_deeparg = true + taxonomy_mmseqs_databases_id = 'SILVA' + run_amp_screening = true + amp_skip_amplify = true + amp_ampcombi_cutoff = 0.6 } From 24a6440d8b5c986600018aa52233d853d2423ab7 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 19 Mar 2024 14:07:50 +0100 Subject: [PATCH 24/39] change memory in test --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index 316c660e..afbf17d1 100644 --- a/conf/test.config +++ b/conf/test.config @@ -16,7 +16,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '6.GB' + max_memory = '10.GB' max_time = '6.h' // Input data From eb276c80364164984f42f19b63c6f8d990814862 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 19 Mar 2024 14:41:03 +0100 Subject: [PATCH 25/39] update test.config --- conf/test.config | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/conf/test.config b/conf/test.config index afbf17d1..83a4731a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,14 +23,16 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' - annotation_tool = 'prodigal' - - run_arg_screening = true - arg_fargene_hmmmodel = 'class_a,class_b_1_2' - arg_skip_amrfinderplus = true - arg_skip_deeparg = true - taxonomy_mmseqs_databases_id = 'SILVA' - run_amp_screening = true - amp_skip_amplify = true - amp_ampcombi_cutoff = 0.6 + //taxonomy_mmseqs_classification_off = true + annotation_tool = 'prodigal' + + run_arg_screening = true + arg_fargene_hmmmodel = 'class_a,class_b_1_2' + //arg_skip_amrfinderplus = true + //arg_skip_deeparg = true + + //taxonomy_mmseqs_databases_id = 'SILVA' + + run_amp_screening = true + //amp_skip_amplify = true } From dc9e85915cd8a0d90d1c6deff7afdd14edd03821 Mon Sep 17 
00:00:00 2001 From: darcy220606 Date: Tue, 19 Mar 2024 15:13:00 +0100 Subject: [PATCH 26/39] increase the RAM for CI tests --- conf/test.config | 8 +------- conf/test_bgc.config | 2 +- conf/test_nothing.config | 2 +- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/conf/test.config b/conf/test.config index 83a4731a..6b9b13f3 100644 --- a/conf/test.config +++ b/conf/test.config @@ -16,23 +16,17 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '10.GB' + max_memory = '8.5.GB' max_time = '6.h' // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' - //taxonomy_mmseqs_classification_off = true annotation_tool = 'prodigal' run_arg_screening = true arg_fargene_hmmmodel = 'class_a,class_b_1_2' - //arg_skip_amrfinderplus = true - //arg_skip_deeparg = true - - //taxonomy_mmseqs_databases_id = 'SILVA' run_amp_screening = true - //amp_skip_amplify = true } diff --git a/conf/test_bgc.config b/conf/test_bgc.config index 17df755d..35fc4c29 100644 --- a/conf/test_bgc.config +++ b/conf/test_bgc.config @@ -16,7 +16,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '6.GB' + max_memory = '8.5.GB' max_time = '6.h' // Input data diff --git a/conf/test_nothing.config b/conf/test_nothing.config index 5f850139..acc2385d 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -18,7 +18,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '6.GB' + max_memory = '8.5.GB' max_time = '6.h' // Input data From 44c20f13bc3a851c086dbbb643ec4de8f9f7090b Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 19 Mar 2024 15:19:08 +0100 Subject: [PATCH 27/39] update teh RAM for CI test 9.0GB --- conf/test.config | 2 +- conf/test_bgc.config | 2 +- 
conf/test_nothing.config | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/test.config b/conf/test.config index 6b9b13f3..51614d1c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -16,7 +16,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '8.5.GB' + max_memory = '9.GB' max_time = '6.h' // Input data diff --git a/conf/test_bgc.config b/conf/test_bgc.config index 35fc4c29..578555b7 100644 --- a/conf/test_bgc.config +++ b/conf/test_bgc.config @@ -16,7 +16,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '8.5.GB' + max_memory = '9.GB' max_time = '6.h' // Input data diff --git a/conf/test_nothing.config b/conf/test_nothing.config index acc2385d..2509be4a 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -18,7 +18,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '8.5.GB' + max_memory = '9.GB' max_time = '6.h' // Input data From 0bc085cd9cfd67434ff3dbaab14a7d67b0f40cf9 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Sat, 23 Mar 2024 12:53:20 +0100 Subject: [PATCH 28/39] update to revieweres comments --- .github/workflows/ci.yml | 31 +++++ CHANGELOG.md | 7 +- CITATIONS.md | 2 +- README.md | 2 +- bin/merge_taxonomy.py | 15 +- conf/modules.config | 57 +++++--- conf/test.config | 10 +- conf/test_bgc.config | 2 +- conf/test_taxonomy.config | 38 ++++++ docs/output.md | 44 +++--- docs/usage.md | 7 +- modules/local/merge_taxonomy_ampcombi.nf | 2 +- modules/local/merge_taxonomy_combgc.nf | 2 +- modules/local/merge_taxonomy_hamronization.nf | 2 +- nextflow.config | 30 ++-- nextflow_schema.json | 95 ++++++------- subworkflows/local/amp.nf | 30 ++-- subworkflows/local/arg.nf | 46 ++++--- subworkflows/local/bgc.nf | 23 +++- subworkflows/local/taxa.nf | 55 ++++++++ workflows/funcscan.nf | 129 +++++++++++------- 21 files changed, 412 insertions(+), 217 deletions(-) create mode 100644 
conf/test_taxonomy.config create mode 100644 subworkflows/local/taxa.nf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e29e68c5..f05ceb88 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,3 +77,34 @@ jobs: - name: Run pipeline with test data (BGC workflow) run: | nextflow run ${GITHUB_WORKSPACE} -profile test_bgc,docker --outdir ./results ${{ matrix.parameters }} --bgc_skip_deepbgc + + test_taxonomy: + name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows) + # Only run on push if this is the nf-core dev branch (merged PRs) + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}" + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "23.04.0" + - "latest-everything" + parameters: + - "--annotation_tool prodigal" + - "--annotation_tool prokka" + - "--annotation_tool bakta --annotation_bakta_db_downloadtype light" + + steps: + - name: Check out pipeline code + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" + + - name: Disk space cleanup + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + + - name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows) + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_taxonomy,docker --outdir ./results ${{ matrix.parameters }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 98cbf145..d9c1d617 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,11 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` -- [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with 'sample_id\tcontig_id\t..'. Reformated the output of 'hamronization summarize' module. 
(by @darcy220606) -- [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. Removed `tests/` from `.gitignore` (by - @darcy220606) - -### `Dependencies` +- [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with 'sample_id\tcontig_id\t..'. Reformatted the output of `hamronization/summarize` module. (by @darcy220606) +- [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. Removed `tests/` from `.gitignore`. (by @darcy220606) | Tool | Previous version | New version | | ------- | ---------------- | ----------- | diff --git a/CITATIONS.md b/CITATIONS.md index fd346404..29fc2a21 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -92,7 +92,7 @@ - [MMseqs2](https://doi.org/10.1093bioinformatics/btab184) - > Mirdita M., Steinegger M., Breitwieser F., Söding J., Levy Karin E. (2021). Fastand sensitive taxonomic assignment to metagenomic contigs, Bioinformatics, 37(18),3029–3031. [DOI: 10.1093/bioinformatics/btab184](https://doi.org/10.1093bioinformatics/btab184) + > Mirdita, M., Steinegger, M., Breitwieser, F., Söding, J., Levy Karin, E. (2021). Fast and sensitive taxonomic assignment to metagenomic contigs. Bioinformatics, 37(18),3029–3031. [DOI: 10.1093/bioinformatics/btab184](https://doi.org/10.1093/bioinformatics/btab184) ## Software packaging/containerisation tools diff --git a/README.md b/README.md index 1ff06a18..d56dbc47 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ The nf-core/funcscan AWS full test dataset are contigs generated by the MGnify s ## Pipeline summary -1. 
Taxonomic classification of contigs from **of prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2). +1. Taxonomic classification of contigs of **prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2) 2. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta) 3. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify) 4. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg) diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py index 6b5db97b..f901ed91 100755 --- a/bin/merge_taxonomy.py +++ b/bin/merge_taxonomy.py @@ -35,7 +35,7 @@ subparsers = parser.add_subparsers(required=True) ######################################### -# SUBPARSERS : AMPCOMBI +# SUBPARSER: AMPCOMBI ######################################### ampcombi_parser = subparsers.add_parser('ampcombi_taxa') @@ -44,7 +44,7 @@ ampcombi_parser.add_argument("--taxonomy", dest="taxa1", nargs='+', help="Enter the list of taxonomy files for all samples. ") ######################################### -# SUBPARSERS : COMBGC +# SUBPARSER: COMBGC ######################################### combgc_parser = subparsers.add_parser('combgc_taxa') @@ -53,7 +53,7 @@ combgc_parser.add_argument("--taxonomy", dest="taxa2", nargs='+', help="Enter the list of taxonomy files for all samples. 
") ######################################### -# SUBPARSERS : HAMRONIZATION +# SUBPARSER: HAMRONIZATION ######################################### hamronization_parser = subparsers.add_parser('hamronization_taxa') @@ -74,7 +74,6 @@ def reformat_mmseqs_taxonomy(mmseqs_taxonomy): lineage = str(row['mmseqs_lineage_contig']) if 'Eukaryota' in lineage or 'root' in lineage: mmseqs2_df.at[i, 'mmseqs_lineage_contig'] = np.nan - #mmseqs2_df['mmseqs_lineage_contig'].unique() # insert the sample name in the first column according to the file basename file_basename = os.path.basename(mmseqs_taxonomy) filename = os.path.splitext(file_basename)[0] @@ -82,7 +81,7 @@ def reformat_mmseqs_taxonomy(mmseqs_taxonomy): return mmseqs2_df ######################################### -# FUNCTION : AMPCOMBI +# FUNCTION: AMPCOMBI ######################################### def ampcombi_taxa(args): merged_df = pd.DataFrame() @@ -128,7 +127,7 @@ def ampcombi_taxa(args): merged_df.to_csv('ampcombi_complete_summary_taxonomy.tsv', sep='\t', index=False) ######################################### -# FUNCTION : COMBGC +# FUNCTION: COMBGC ######################################### def combgc_taxa(args): merged_df = pd.DataFrame() @@ -172,7 +171,7 @@ def combgc_taxa(args): merged_df.to_csv('combgc_complete_summary_taxonomy.tsv', sep='\t', index=False) ######################################### -# FUNCTION : HAMRONIZATION +# FUNCTION: HAMRONIZATION ######################################### def hamronization_taxa(args): merged_df = pd.DataFrame() @@ -220,7 +219,7 @@ def hamronization_taxa(args): merged_df.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False) ######################################### -# SUBPARSERS : DEFAULT +# SUBPARSERS: DEFAULT ######################################### ampcombi_parser.set_defaults(func=ampcombi_taxa) combgc_parser.set_defaults(func=combgc_taxa) diff --git a/conf/modules.config b/conf/modules.config index b296fed7..9503f3d0 100644 --- a/conf/modules.config 
+++ b/conf/modules.config @@ -46,49 +46,50 @@ process { withName: MMSEQS_DATABASES { publishDir = [ - path: { "${params.outdir}/databases/" }, + path: { "${params.outdir}/databases/mmseqs/" }, // dir==mmseqs_database/ mode: params.publish_dir_mode, enabled: params.save_databases, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = [ - params.taxonomy_mmseqs_databases_savetmp ? "" : "--remove-tmp-files" , + params.mmseqs_databases_savetmp ? "" : "--remove-tmp-files" , ].join(' ').trim() } withName: MMSEQS_CREATEDB { publishDir = [ - path: { "${params.outdir}/taxonomy/mmseqs_createdb/" }, + path: { "${params.outdir}/databases/mmseqs/mmseqs_createdb/" }, mode: params.publish_dir_mode, - enabled: params.taxonomy_mmseqs_save_intermedfiles, + enabled: params.save_databases, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } withName: MMSEQS_TAXONOMY { publishDir = [ - path: { "${params.outdir}/taxonomy/mmseqs_taxonomy/" }, + path: { "${params.outdir}/databases/mmseqs/mmseqs_taxonomy/" }, mode: params.publish_dir_mode, - enabled: params.taxonomy_mmseqs_save_intermedfiles, + enabled: params.save_databases, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = [ - params.taxonomy_mmseqs_taxonomy_savetmp ? "" : "--remove-tmp-files", - "--search-type ${params.taxonomy_mmseqs_taxonomy_searchtype}", - "--lca-ranks ${params.taxonomy_mmseqs_taxonomy_lcaranks}", - "--tax-lineage ${params.taxonomy_mmseqs_taxonomy_taxlineage}", - "-s ${params.taxonomy_mmseqs_taxonomy_sensitivity}", - "--orf-filter-s ${params.taxonomy_mmseqs_taxonomy_orffilters}", - "--lca-mode ${params.taxonomy_mmseqs_taxonomy_lcamode}", - "--vote-mode ${params.taxonomy_mmseqs_taxonomy_votemode}", + params.mmseqs_taxonomy_savetmp ? 
"" : "--remove-tmp-files", + "--search-type ${params.mmseqs_taxonomy_searchtype}", + "--lca-ranks ${params.mmseqs_taxonomy_lcaranks}", + "--tax-lineage ${params.mmseqs_taxonomy_taxlineage}", + "-s ${params.mmseqs_taxonomy_sensitivity}", + "--orf-filter-s ${params.mmseqs_taxonomy_orffilters}", + "--lca-mode ${params.mmseqs_taxonomy_lcamode}", + "--vote-mode ${params.mmseqs_taxonomy_votemode}", ].join(' ').trim() } withName: MMSEQS_CREATETSV { publishDir = [ - path: { "${params.outdir}/taxonomy/mmseqs_createtsv/${meta.id}" }, + path: { "${params.outdir}/taxonomic_classification/mmseqs_createtsv/${meta.id}/" }, mode: params.publish_dir_mode, - enabled: params.taxonomy_mmseqs_save_intermedfiles, + enabled: params.run_taxonomic_classification, + pattern: "*.tsv", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } @@ -499,11 +500,19 @@ process { publishDir = [ path: { "${params.outdir}/reports/hamronization_summarize" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { (params.run_taxonomic_classification == false) ? it : null } ] } withName: MERGE_TAXONOMY_HAMRONIZATION { + publishDir = [ + path: { "${params.outdir}/reports/hamronization_summarize" }, + mode: params.publish_dir_mode, + saveAs: { _ -> null } //do not save the file + ] + } + + withName: ARG_TABIX_BGZIP { publishDir = [ path: { "${params.outdir}/reports/hamronization_summarize" }, mode: params.publish_dir_mode, @@ -525,12 +534,12 @@ process { withName: MERGE_TAXONOMY_AMPCOMBI { publishDir = [ path: { "${params.outdir}/reports/ampcombi" }, - mode: params.taxonomy_mmseqs_save_intermedfiles, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + mode: params.publish_dir_mode, + saveAs: { _ -> null } //do not save the file ] } - withName: TABIX_BGZIP { + withName: AMP_TABIX_BGZIP { publishDir = [ path: { "${params.outdir}/reports/ampcombi" }, mode: params.publish_dir_mode, @@ -547,6 +556,14 @@ process { } withName: MERGE_TAXONOMY_COMBGC { + publishDir = [ + path: { "${params.outdir}/reports/combgc" }, + mode: params.publish_dir_mode, + saveAs: { _ -> null } //do not save the file + ] + } + + withName: BGC_TABIX_BGZIP { publishDir = [ path: { "${params.outdir}/reports/combgc" }, mode: params.publish_dir_mode, diff --git a/conf/test.config b/conf/test.config index 51614d1c..9e95a491 100644 --- a/conf/test.config +++ b/conf/test.config @@ -16,17 +16,17 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '9.GB' + max_memory = '8.GB' max_time = '6.h' // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' - annotation_tool = 'prodigal' + annotation_tool = 'prodigal' - run_arg_screening = true - arg_fargene_hmmmodel = 'class_a,class_b_1_2' + run_arg_screening = true + arg_fargene_hmmmodel = 'class_a,class_b_1_2' - run_amp_screening = true + run_amp_screening = true } diff --git a/conf/test_bgc.config b/conf/test_bgc.config index 578555b7..89228579 100644 --- a/conf/test_bgc.config +++ b/conf/test_bgc.config @@ -16,7 +16,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '9.GB' + max_memory = '8.GB' max_time = '6.h' // Input data diff --git a/conf/test_taxonomy.config b/conf/test_taxonomy.config new file mode 100644 index 00000000..d8743664 --- /dev/null +++ b/conf/test_taxonomy.config @@ -0,0 +1,38 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running 
minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/funcscan -profile test_taxonomy, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Taxonomic classification test profile' + config_profile_description = 'Minimal test dataset to check taxonomic classification workflow function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '8.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' + bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' + amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' + + run_taxonomic_classification = true + annotation_tool = 'prodigal' + + run_arg_screening = true + arg_skip_deeparg = true + arg_skip_amrfinderplus = true + + run_amp_screening = true + + run_bgc_screening = true + bgc_skip_deepbgc = true +} diff --git a/docs/output.md b/docs/output.md index e8fa59fe..83d58ec2 100644 --- a/docs/output.md +++ b/docs/output.md @@ -8,7 +8,7 @@ The output of nf-core/funcscan provides reports for each of the functional group - antimicrobial peptides (tools: [Macrel](https://github.com/BigDataBiology/macrel), [AMPlify](https://github.com/bcgsc/AMPlify), [ampir](https://ampir.marine-omics.net), [hmmsearch](http://hmmer.org) – summarised by [AMPcombi](https://github.com/Darcy220606/AMPcombi)) - biosynthetic gene clusters (tools: [antiSMASH](https://docs.antismash.secondarymetabolites.org), [DeepBGC](https://github.com/Merck/deepbgc), [GECCO](https://gecco.embl.de), 
[hmmsearch](http://hmmer.org) – summarised by [comBGC](#combgc)) -As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set and taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) if the `taxonomy_mmseqs_save_intermedfiles` flag was set. +As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. Additionally, taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) are saved if the `taxonomy_mmseqs_save_intermedfiles` flag was set. Similarly, all downloaded databases are saved (i.e. 
from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/downloads/` if the `--save_databases` flag was set. @@ -18,9 +18,7 @@ The directories listed below will be created in the results directory (specified ```console results/ -├── taxonomy/ -| ├── mmseqs_createdb/ -| ├── mmseqs_taxonomy/ +├── taxonomic_classification/ | └── mmseqs_createtsv/ ├── annotation/ | ├── bakta/ @@ -58,7 +56,7 @@ work/ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes prokaryotic sequence data through the following steps: -Taxonomy classification of contigs with: +Taxonomy classification of nucleotide sequences with: - [MMseqs2](https://github.com/soedinglab/MMseqs2) (default) - for contig taxonomic classification using 2bLCA. @@ -103,22 +101,16 @@ Output Summaries: ### Taxonomic classification tool -[MMseqs2](#MMseqs2) -
Output files -- `taxonomy/mmseqs2_createtsv/` +- `taxonomic_classification/mmseqs2_createtsv/` - `/`: - `*.tsv`: tab seperated table containing the taxonomic lineage of every contig when available. -- `reports//_complete_summary_taxonomy.tsv`: tab seperated table containing the taxonomic lineage of every contig when available along with teh results from the summary tables. - > Descriptions taken from the [MMseqs2 documentation](https://github.com/soedinglab/MMseqs2/wiki) - +- `reports//_complete_summary_taxonomy.tsv.gz`: tab seperated table containing the concatenated results from the summary tables along with the taxonomic classification if the parameter `run_taxonomic_classification` is called.
-[MMseqs2](https://github.com/soedinglab/MMseqs2) classifies the taxonomic lineage of contigs based on the least common ancestor. The taxonomic lineage produced is also added to the final workflow summaries to annotate the potential source bacteria of the BGC, AMP, and ARG. - -### Annotation tools +[MMseqs2](https://github.com/soedinglab/MMseqs2) classifies the taxonomic lineage of contigs based on the last common ancestor. The inferred taxonomic lineages are included in the final workflow summaries to annotate the potential source bacteria of the identified AMPs, ARGs, and/or BGCs. [Pyrodigal](#pyrodigal), [Prodigal](#prodigal), [Prokka](#prokka), [Bakta](#bakta) @@ -428,17 +420,13 @@ Output Summaries: Output files - `ampcombi/` - - `ampcombi_complete_summary_taxonomy.tsv`: summarised output from all AMP workflow tools with taxonomic assignment in tsv format - - `ampcombi_complete_summary_taxonomy.tsv.gz`: summarised output from all AMP workflow tools with taxonomic assignment in compressed tsv format + - `ampcombi_complete_summary.tsv`: tab seperated table containing the concatenated results from the ampcombi summary tables. This is the output given when the taxonomic classification is not activated, i.e., pipeline default. + - `ampcombi_complete_summary_taxonomy.tsv.gz`: summarised output from all AMP workflow tools with taxonomic assignment in compressed tsv format. - `ampcombi.log`: a log file generated by ampcombi - - `/*_ampcombi.csv`: summarised output in csv for each sample - - `/*_amp.faa*`: fasta file containing the amino acid sequences for all AMP hits for each sample - - `/*_diamond_matches.txt*`: alignment file generated by DIAMOND for each sample - - - -
-AMP summary table header descriptions + - `/*_ampcombi.csv`: summarised output in csv for each sample + - `/*_amp.faa*`: fasta file containing the amino acid sequences for all AMP hits for each sample + - `/*_diamond_matches.txt*`: alignment file generated by DIAMOND for each sample + AMP summary table header descriptions | Table column | Description | | ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -489,9 +477,10 @@ Output Summaries:
Output files -- `hamronization/` one of the following: +- `hamronization_summarize/` one of the following: - `hamronization_combined_report.json`: summarised output in .json format - - `hamronization_combined_report.tsv`: summarised output in .tsv format + - `hamronization_combined_report.tsv`: summarised output in .tsv format when the taxonomic classification is turned off (pipeline default). + - `hamronization_combined_report.tsv.gz`: summarised output in zipped format when the taxonomic classification is turned on by `run_taxonomic_classification`. - `hamronization_combined_report.html`: interactive output in .html format
@@ -547,7 +536,8 @@ Output Summaries: Output files - `comBGC/` - - `combgc_complete_summary.tsv`: summarised output from all BGC detection tools used in tsv format (all samples concatenated). + - `combgc_complete_summary.tsv`: summarised output from all BGC detection tools used in tsv format (all samples concatenated). This is the output given when the taxonomic classification is not activated, i.e., pipeline default. + - `combgc_complete_summary.tsv.gz`: summarised output in zipped format from all BGC detection tools used in tsv format (all samples concatenated) along with the taxonomic classification obtained when `run_taxonomic_classification` is activated. - `*/combgc_summary.tsv`: summarised output from all applied BGC detection tools in tsv format for each sample.
diff --git a/docs/usage.md b/docs/usage.md index b5fb3dd7..7f364405 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,7 +6,7 @@ ## Introduction -nf-core/funcscan is a pipeline for efficient and parallelised screening of long nucleotide sequences such as contigs for antimicrobial peptide genes, antimicrobial resistance genes, and biosynthetic gene clusters. +nf-core/funcscan is a pipeline for efficient and parallelised screening of long nucleotide sequences such as contigs for antimicrobial peptide genes, antimicrobial resistance genes, and biosynthetic gene clusters. It further identifies their taxonomic origin. ## Running the pipeline @@ -18,13 +18,14 @@ nextflow run nf-core/funcscan --input samplesheet.csv --outdir -profile This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. -To run any of the three screening workflows (AMP, ARG, and/or BGC), switch them on by adding the respective flag(s) to the command: +To run any of the three screening workflows (AMP, ARG, and/or BGC) or taxonomic classification (currently done with [MMseqs2](https://github.com/soedinglab/MMseqs2)), switch them on by adding the respective flag(s) to the command: - `--run_amp_screening` - `--run_arg_screening` - `--run_bgc_screening` +- `--run_taxonomic_classification` -When switched on, all tools of the given workflow will be run by default. If you don't need specific tools, you can explicitly skip them. +When switched on, all tools of the given workflow will be run by default. If you don't need specific tools, you can explicitly skip them. For the taxonomic classification, MMseqs2 is currently the only tool implemented in the pipeline. **Example:** You want to run AMP and ARG screening but you don't need the DeepARG tool of the ARG workflow and the Macrel tool of the AMP workflow. 
Your command would be: diff --git a/modules/local/merge_taxonomy_ampcombi.nf b/modules/local/merge_taxonomy_ampcombi.nf index 6a7c0185..c9f3e30d 100644 --- a/modules/local/merge_taxonomy_ampcombi.nf +++ b/modules/local/merge_taxonomy_ampcombi.nf @@ -12,7 +12,7 @@ process MERGE_TAXONOMY_AMPCOMBI { output: path("ampcombi_complete_summary_taxonomy.tsv") , emit: tsv - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/merge_taxonomy_combgc.nf b/modules/local/merge_taxonomy_combgc.nf index 263d6700..95043310 100644 --- a/modules/local/merge_taxonomy_combgc.nf +++ b/modules/local/merge_taxonomy_combgc.nf @@ -12,7 +12,7 @@ process MERGE_TAXONOMY_COMBGC { output: path("combgc_complete_summary_taxonomy.tsv") , emit: tsv - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/merge_taxonomy_hamronization.nf b/modules/local/merge_taxonomy_hamronization.nf index 97d066ff..9c50bc12 100644 --- a/modules/local/merge_taxonomy_hamronization.nf +++ b/modules/local/merge_taxonomy_hamronization.nf @@ -12,7 +12,7 @@ process MERGE_TAXONOMY_HAMRONIZATION { output: path("hamronization_complete_summary_taxonomy.tsv") , emit: tsv - path "versions.yml" , emit: versions + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/nextflow.config b/nextflow.config index 7068daa1..c798b050 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,21 +13,21 @@ params { input = null // Taxonomy classification options - taxonomy_mmseqs_classification_off = false - taxonomy_mmseqs_save_intermedfiles = false - - taxonomy_mmseqs_databases_localpath = null - taxonomy_mmseqs_databases_id = 'Kalamari' - taxonomy_mmseqs_databases_savetmp = false - - taxonomy_mmseqs_taxonomy_savetmp = false - taxonomy_mmseqs_taxonomy_searchtype = 2 - taxonomy_mmseqs_taxonomy_lcaranks = 
'kingdom,phylum,class,order,family,genus,species' - taxonomy_mmseqs_taxonomy_taxlineage = 1 - taxonomy_mmseqs_taxonomy_sensitivity = '5.0' - taxonomy_mmseqs_taxonomy_orffilters = '2.0' - taxonomy_mmseqs_taxonomy_lcamode = 3 - taxonomy_mmseqs_taxonomy_votemode = 1 + run_taxonomic_classification = false + taxonomic_classification_tool = 'mmseqs2' + + mmseqs_databases_localpath = null + mmseqs_databases_id = 'Kalamari' + mmseqs_databases_savetmp = false + + mmseqs_taxonomy_savetmp = false + mmseqs_taxonomy_searchtype = 2 + mmseqs_taxonomy_lcaranks = 'kingdom,phylum,class,order,family,genus,species' + mmseqs_taxonomy_taxlineage = 1 + mmseqs_taxonomy_sensitivity = '5.0' + mmseqs_taxonomy_orffilters = '2.0' + mmseqs_taxonomy_lcamode = 3 + mmseqs_taxonomy_votemode = 1 // Annotation options annotation_tool = 'pyrodigal' diff --git a/nextflow_schema.json b/nextflow_schema.json index 8c2fb209..a0d58dec 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -68,109 +68,110 @@ }, "fa_icon": "fas fa-network-wired" }, - "taxonomy": { - "title": "Taxonomy", + "taxonomic_classification": { + "title": "Taxonomic classification", "type": "object", - "description": "These options influence whether to activate the taxonomic classification of contigs", + "description": "These options influence whether to activate the taxonomic classification of the input nucleotide sequences.", "default": "", - "fa_icon": "fas fa-address-book", "properties": { - "taxonomy_mmseqs_classification_off": { + "run_taxonomic_classification": { "type": "boolean", - "fa_icon": "fas fa-ad", - "description": "Activates the taxonomic classification of input contigs." + "description": "Activates the taxonomic classification of input mucleotide sequences.", + "help_text": "This flag turns on the taxonomic classification of input nucleotide sequences. 
The taxonomic annotations should be turned on if the input metagenomes' bacterial sources are unknown, which can help identify the source of the AMP, BGC or ARG hit obtained for laboratory experiments. This flag should be turned off (which is by default) if the input nucleotide sequences represent a single known genome or *nf-core/mag* was run beforehand. Turning on this flag relatively decreases the pipeline speed and requires >8GB RAM. Due to the size of the resulting table, the final complete summary is in a zipped format.", "fa_icon": "fas fa-ad" }, - "taxonomy_mmseqs_save_intermedfiles": { - "type": "boolean", - "fa_icon": "fas fa-file-download", - "description": "Save any intermediate files created in the taxonomic classification step.", - "help_text": "This flag saves to the output folder all the databases created to generate the final taxonomic lineages." + "taxonomic_classification_tool": { + "type": "string", + "default": "mmseqs2", + "help_text": "This flag specifies which tool for taxonomic classification should be activated. 
At the moment only 'MMseqs2' is incorporated in the pipeline.", "description": "Specifies the tool used for taxonomic classification.", "fa_icon": "fas fa-check-circle" } }, "fa_icon": "fas fa-address-book" }, - "taxonomy_mmseqs_databases": { - "title": "Taxonomy: MMseqs databases", + "taxonomic_classification_mmseqs2_databases": { + "title": "Taxonomic classification: MMseqs2 databases", "type": "object", "description": "These parameters influence the database to be used in classifying the taxonomy.", "default": "", "properties": { - "taxonomy_mmseqs_databases_localpath": { - "type": "string", - "description": "Specifiy a path to MMsqes2 formatted database.", + "mmseqs_databases_localpath": { "description": "Specify a path to MMseqs2-formatted database.", "help_text": "Specify a path to a database that is prepared in MMseqs2 format as detailed in the [documentation](https://mmseqs.com/latest/userguide.pdf).", "fa_icon": "fab fa-stackpath" }, - "taxonomy_mmseqs_databases_id": { + "mmseqs_databases_id": { "type": "string", "default": "Kalamari", - "help_text": "Specify the MMseqs2 formatted database to use to classify the input contigs. This can be a nucleotide or amino acid database, however the database chosen must include taxonomic classifications. For example both GTDB, an amico acid database and SILVA, a nucleotide database are both databases provided by MMseqs2 that have taxonomic classifications. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases \".", + "help_text": "Specify which MMseqs2-formatted database to use to classify the input contigs. This can be a nucleotide or amino acid database that includes taxonomic classifications. For example, both GTDB (an amino acid database) and SILVA (a nucleotide database) are supported by MMseqs2. 
More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases \".", "description": "Specify the label of the database to be used.", "fa_icon": "fas fa-address-card" }, - "taxonomy_mmseqs_databases_savetmp": { + "mmseqs_databases_savetmp": { "type": "boolean", - "help_text": "This flag saves in the output folder the temporary files created when downloading the database and creating in the mmseqs2 format. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases: `--remove-tmp-files`\".", + "help_text": "This flag saves the temporary files from downloading the database and formatting them in the MMseqs2 format into the output folder. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases: `--remove-tmp-files`\".", "description": "Specify whether the temporary files should be saved.", "fa_icon": "fas fa-file-download" } }, "fa_icon": "far fa-address-card" }, - "taxonomy_mmseqs2_taxonomy": { - "title": "Taxonomy: MMseqs2 taxonomy", + "taxonomic_classification_mmseqs2_taxonomy": { + "title": "Taxonomic classification: MMseqs2 taxonomy", "type": "object", "description": "These parameters influence the taxonomic classification step.", "default": "", "properties": { - "taxonomy_mmseqs_taxonomy_savetmp": { + "mmseqs_taxonomy_savetmp": { "type": "boolean", - "help_text": "This flag saves in the output folder the temporary files created when creating the taxonomy database and final 'tsv' file. Save More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--remove-tmp-files`\".\n", + "help_text": "This flag saves the temporary files from creating the taxonomy database and the final `tsv` file into the output folder. 
More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--remove-tmp-files`\".\n", "description": "Specify whether to save the temporary files.", "fa_icon": "fab fa-adversal" }, - "taxonomy_mmseqs_taxonomy_searchtype": { + "mmseqs_taxonomy_searchtype": { "type": "integer", "default": 2, "help_text": "Specify the type of alignment to be carried out between the query database and the reference MMseqs2 database. This can be set to '0' for automatic detection, '1' for amino acid alignment, '2' for translating the inputs and running the alignment on the translated sequences, '3' nucleotide based alignment and '4' for the translated nucleotide sequences alignment. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--search-type`\".", "description": "Specify the alignment type between database and query.", "fa_icon": "fas fa-align-center" }, - "taxonomy_mmseqs_taxonomy_lcaranks": { + "mmseqs_taxonomy_lcaranks": { "type": "string", "default": "kingdom,phylum,class,order,family,genus,species", - "help_text": "Specify the taxonomic ranks to include in the taxonomic lineage column in the final '.tsv' file. For example, 'kingdom,phylum,class,order,family,genus,species'. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-ranks`\".", + "help_text": "Specify the taxonomic ranks to include in the taxonomic lineage column in the final `.tsv` file. For example, 'kingdom,phylum,class,order,family,genus,species'. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). 
\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-ranks`\".", "description": "Specify the taxonomic levels to display in the result table.", "fa_icon": "fas fa-stream" }, - "taxonomy_mmseqs_taxonomy_taxlineage": { + "mmseqs_taxonomy_taxlineage": { "type": "integer", "default": 1, - "help_text": "This flag specifies whether the taxonomic lineage should be included in the output '.tsv' file. The taxonomic lineage is obtained due to the internal module of mmseqs taxonomy that implements the least common ancestor to classify the taxonomy. A value of '0' writes no taxonomic lineage, a value of '1' adds a column with the full lineage names prefixed with abbreviation of the lineage level, e.g., k_Prokaryotes;p_Bacteroidetes;c_....;o_....;f_....;g_....;s_...., while a value of '2' adds a column with the full NCBI taxids lineage,e.g., 1324;2345;4546;5345. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--tax-lineage`\".", + "help_text": "This flag specifies whether the taxonomic lineage should be included in the output `.tsv` file. The taxonomic lineage is obtained from the internal module of `mmseqs/taxonomy` that infers the last common ancestor to classify the taxonomy. A value of '0' writes no taxonomic lineage, a value of '1' adds a column with the full lineage names prefixed with abbreviation of the lineage level, e.g., k_Prokaryotes;p_Bacteroidetes;c_....;o_....;f_....;g_....;s_...., while a value of '2' adds a column with the full NCBI taxids lineage,e.g., 1324;2345;4546;5345. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). 
\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--tax-lineage`\".", "description": "Specify whether to include or remove the taxonomic lineage.", "fa_icon": "fab fa-audible" }, - "taxonomy_mmseqs_taxonomy_sensitivity": { + "mmseqs_taxonomy_sensitivity": { "type": "string", "default": "5.0", "help_text": "This flag specifies the speed and sensitivity of the taxonomic search. It stands for how many kmers should be produced during the preliminary seeding stage. A very fast search requires a low value e.g., '1.0' and a a very sensitive search requires e.g., '7.0'. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--s`\".", "description": "Specify the speed and sensitivity for taxonomy assignment.", "fa_icon": "fas fa-history" }, - "taxonomy_mmseqs_taxonomy_orffilters": { + "mmseqs_taxonomy_orffilters": { "type": "string", "default": "2.0", - "help_text": "This flag specifies the sensitivity used for prefiltering the query ORF. Before the taxonomy assigning step, mmseqs2 searches the predicted ORFs against the database provided. This value specifies the speed with which the search is carried out. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--orf-filter-s`\".", - "description": "Specify the ORF search speed in the prefilter step.", + "help_text": "This flag specifies the sensitivity used for prefiltering the query ORF. Before the taxonomy-assigning step, MMseqs2 searches the predicted ORFs against the provided database. This value influences the speed with which the search is carried out. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). 
\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--orf-filter-s`\".", + "description": "Specify the ORF search sensitivity in the prefilter step.", "fa_icon": "fas fa-clock" }, - "taxonomy_mmseqs_taxonomy_lcamode": { + "mmseqs_taxonomy_lcamode": { "type": "integer", "default": 3, - "help_text": "This flag specifies the strategy used for assigning the least common ancestor (LCA). MMseqs2 assigns taxonomy based on an accelerated approximation of the 2bLCA protocol and uses the value of '3'. In this mode the taxonomic assignment is based not only on usual alignment parameters but also considers the taxonomic classification of the LCA. When the value '4' is used the LCA is assigned based on all the equal scoring top hits. If the value '1' is used the LCA assignment is disregarded and the taxonomic assignment is based on usual alignment parameters like evalue and coverage. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-mode`\".", + "help_text": "This flag specifies the strategy used for assigning the last common ancestor (LCA). MMseqs2 assigns taxonomy based on an accelerated approximation of the 2bLCA protocol and uses the value of '3'. In this mode, the taxonomic assignment is based not only on usual alignment parameters but also considers the taxonomic classification of the LCA. When the value '4' is used the LCA is assigned based on all the equal scoring top hits. If the value '1' is used the LCA assignment is disregarded and the taxonomic assignment is based on usual alignment parameters like evalue and coverage. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). 
\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-mode`\".", "description": "Specify the mode to assign the taxonomy.", "fa_icon": "fas fa-broom" }, - "taxonomy_mmseqs_taxonomy_votemode": { + "mmseqs_taxonomy_votemode": { "type": "integer", "default": 1, "help_text": "This flag assigns the mode value with which the weights are computed. The value of '0' stands for uniform weights of taxonomy assignments, the value of '1' uses the minus log E-value and '2' the actual score. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--vote-mode`\".", @@ -372,7 +373,7 @@ "default": "Bacteria", "fa_icon": "fab fa-accusoft", "description": "Specify the kingdom that the input represents.", - "help_text": "Specifies the kingdom that the input sample is derived from and/or you wish to screen for\n\n> ⚠️ Prokka cannot annotate Eukaryotes.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--kingdom`", + "help_text": "Specifies the kingdom that the input sample is derived from and/or you wish to screen for\n\n> \u26a0\ufe0f Prokka cannot annotate Eukaryotes.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--kingdom`", "enum": ["Archaea", "Bacteria", "Mitochondria", "Viruses"] }, "annotation_prokka_gcode": { @@ -393,7 +394,7 @@ }, "annotation_prokka_evalue": { "type": "number", - "default": 0.000001, + "default": 1e-6, "description": "Minimum e-value cut-off.", "help_text": "Specifiy the minimum e-value used for filtering the alignment hits.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--evalue`", "fa_icon": "fas fa-sort-amount-down" @@ -641,7 +642,7 @@ }, "amp_ampcombi_cutoff": { 
"type": "number", - "default": 0.0, + "default": 0, "description": "Specify probability cutoff to filter AMPs", "help_text": "Specify the minimum probability an AMP hit must have to be retained in the final output file. Anything below this threshold will be removed.\n\nFor more information check AMPcombi [documentation](https://github.com/Darcy220606/AMPcombi).\n\n> Modifies tool parameter(s):\n> - AMPCOMBI: `--cutoff`", "fa_icon": "fas fa-sort-amount-up" @@ -670,7 +671,7 @@ }, "arg_amrfinderplus_identmin": { "type": "number", - "default": -1.0, + "default": -1, "help_text": "Specify the minimum percentage amino-acid identity to reference protein or nucleotide identity for nucleotide reference must have if a BLAST alignment (based on methods: BLAST or PARTIAL) was detected, otherwise NA.\n\n If you specify `-1`, this means use a curated threshold if it exists and `0.9` otherwise.\n\nSetting this value to something other than `-1` will override any curated similarity cutoffs. For BLAST: alignment is > 90% of length and > 90% identity to a protein in the AMRFinderPlus database. 
For PARTIAL: alignment is > 50% of length, but < 90% of length and > 90% identity to the reference, and does not end at a contig boundary.\n\nFor more information check AMRFinderPlus [documentation](https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#--organism-option).\n\n> Modifies tool parameter(s):\n> - AMRFinderPlus: `--ident_min`", "description": "Minimum percent identity to reference sequence.", "fa_icon": "fas fa-angle-left" @@ -978,7 +979,7 @@ "default": 1000, "description": "Minimum longest-contig length a sample must have to be screened with antiSMASH.", "fa_icon": "fas fa-ruler-horizontal", - "help_text": "This specifies the minimum length that the longest contig must have for the entire sample to be screened by antiSMASH.\n\nAny samples that do not reach this length will be not be sent to antiSMASH, therefore you will not receive output for these samples in your `--outdir`.\n\n> ⚠️ This is not the same as `--bgc_antismash_contigminlength`, which specifies to only analyse contigs above that threshold but _within_ a sample that has already passed `--bgc_antismash_sampleminlength` sample filter!" + "help_text": "This specifies the minimum length that the longest contig must have for the entire sample to be screened by antiSMASH.\n\nAny samples that do not reach this length will be not be sent to antiSMASH, therefore you will not receive output for these samples in your `--outdir`.\n\n> \u26a0\ufe0f This is not the same as `--bgc_antismash_contigminlength`, which specifies to only analyse contigs above that threshold but _within_ a sample that has already passed `--bgc_antismash_sampleminlength` sample filter!" 
}, "bgc_antismash_contigminlength": { "type": "integer", @@ -1146,7 +1147,7 @@ "type": "number", "description": "The p-value cutoff for protein domains to be included.", "fa_icon": "fas fa-filter", - "default": 0.000000001, + "default": 1e-9, "help_text": "The p-value cutoff for protein domains to be included.\n\nFor more information see the GECCO [documentation](https://github.com/zellerlab/GECCO).\n\n> Modifies tool parameter(s):\n> - GECCO: `--pfilter`" }, "bgc_gecco_threshold": { @@ -1424,13 +1425,13 @@ "$ref": "#/definitions/screening_type_activation" }, { - "$ref": "#/definitions/taxonomy" + "$ref": "#/definitions/taxonomic_classification" }, { - "$ref": "#/definitions/taxonomy_mmseqs_databases" + "$ref": "#/definitions/taxonomic_classification_mmseqs2_databases" }, { - "$ref": "#/definitions/taxonomy_mmseqs2_taxonomy" + "$ref": "#/definitions/taxonomic_classification_mmseqs2_taxonomy" }, { "$ref": "#/definitions/annotation" diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index 844826a4..a770d459 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -9,14 +9,14 @@ include { AMPIR } from '.. 
include { DRAMP_DOWNLOAD } from '../../modules/local/dramp_download' include { AMPCOMBI } from '../../modules/nf-core/ampcombi/main' include { GUNZIP as GUNZIP_MACREL_PRED ; GUNZIP as GUNZIP_MACREL_ORFS } from '../../modules/nf-core/gunzip/main' -include { TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' -include { MERGE_TAXONOMY_AMPCOMBI } from '../../modules/local/merge_taxonomy_ampcombi' +include { TABIX_BGZIP as AMP_TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' +include { MERGE_TAXONOMY_AMPCOMBI } from '../../modules/local/merge_taxonomy_ampcombi' workflow AMP { take: contigs // tuple val(meta), path(contigs) faa // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) - tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) + tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -106,20 +106,26 @@ workflow AMP { AMPCOMBI( ch_input_for_ampcombi.input, ch_input_for_ampcombi.faa, ch_ampcombi_input_db ) ch_versions = ch_versions.mix(AMPCOMBI.out.versions) - //AMPCOMBI concatenation - ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile(name: 'ampcombi_complete_summary.tsv', keepHeader:true) + if ( !params.run_taxonomic_classification ) { + ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile(name: 'ampcombi_complete_summary.tsv', storeDir: "${params.outdir}/reports/ampcombi",keepHeader:true) + } else { + ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile(name: 'ampcombi_complete_summary.tsv',keepHeader:true) + } // MERGE_TAXONOMY - ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() - MERGE_TAXONOMY_AMPCOMBI(ch_ampcombi_summaries, ch_mmseqs_taxonomy_list) - ch_versions = ch_versions.mix(MERGE_TAXONOMY_AMPCOMBI.out.versions) + if ( params.run_taxonomic_classification ) { - ch_tabix_input = Channel.of(['id':'ampcombi_complete_summary_taxonomy']) - .combine(MERGE_TAXONOMY_AMPCOMBI.out.tsv) + ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + 
MERGE_TAXONOMY_AMPCOMBI(ch_ampcombi_summaries, ch_mmseqs_taxonomy_list) + ch_versions = ch_versions.mix(MERGE_TAXONOMY_AMPCOMBI.out.versions) - TABIX_BGZIP(ch_tabix_input) - ch_versions = ch_versions.mix(TABIX_BGZIP.out.versions) + ch_tabix_input = Channel.of(['id':'ampcombi_complete_summary_taxonomy']) + .combine(MERGE_TAXONOMY_AMPCOMBI.out.tsv) + + AMP_TABIX_BGZIP(ch_tabix_input) + ch_versions = ch_versions.mix(AMP_TABIX_BGZIP.out.versions) + } emit: versions = ch_versions diff --git a/subworkflows/local/arg.nf b/subworkflows/local/arg.nf index 30bfee64..a8418fa3 100644 --- a/subworkflows/local/arg.nf +++ b/subworkflows/local/arg.nf @@ -2,24 +2,25 @@ Run ARG screening tools */ -include { ABRICATE_RUN } from '../../modules/nf-core/abricate/run/main' -include { AMRFINDERPLUS_UPDATE } from '../../modules/nf-core/amrfinderplus/update/main' -include { AMRFINDERPLUS_RUN } from '../../modules/nf-core/amrfinderplus/run/main' -include { FARGENE } from '../../modules/nf-core/fargene/main' -include { DEEPARG_DOWNLOADDATA } from '../../modules/nf-core/deeparg/downloaddata/main' -include { DEEPARG_PREDICT } from '../../modules/nf-core/deeparg/predict/main' -include { RGI_MAIN } from '../../modules/nf-core/rgi/main/main' -include { HAMRONIZATION_ABRICATE } from '../../modules/nf-core/hamronization/abricate/main' -include { HAMRONIZATION_RGI } from '../../modules/nf-core/hamronization/rgi/main' -include { HAMRONIZATION_DEEPARG } from '../../modules/nf-core/hamronization/deeparg/main' -include { HAMRONIZATION_AMRFINDERPLUS } from '../../modules/nf-core/hamronization/amrfinderplus/main' -include { HAMRONIZATION_FARGENE } from '../../modules/nf-core/hamronization/fargene/main' -include { HAMRONIZATION_SUMMARIZE } from '../../modules/nf-core/hamronization/summarize/main' -include { MERGE_TAXONOMY_HAMRONIZATION } from '../../modules/local/merge_taxonomy_hamronization' +include { ABRICATE_RUN } from '../../modules/nf-core/abricate/run/main' +include { AMRFINDERPLUS_UPDATE } from 
'../../modules/nf-core/amrfinderplus/update/main' +include { AMRFINDERPLUS_RUN } from '../../modules/nf-core/amrfinderplus/run/main' +include { FARGENE } from '../../modules/nf-core/fargene/main' +include { DEEPARG_DOWNLOADDATA } from '../../modules/nf-core/deeparg/downloaddata/main' +include { DEEPARG_PREDICT } from '../../modules/nf-core/deeparg/predict/main' +include { RGI_MAIN } from '../../modules/nf-core/rgi/main/main' +include { HAMRONIZATION_ABRICATE } from '../../modules/nf-core/hamronization/abricate/main' +include { HAMRONIZATION_RGI } from '../../modules/nf-core/hamronization/rgi/main' +include { HAMRONIZATION_DEEPARG } from '../../modules/nf-core/hamronization/deeparg/main' +include { HAMRONIZATION_AMRFINDERPLUS } from '../../modules/nf-core/hamronization/amrfinderplus/main' +include { HAMRONIZATION_FARGENE } from '../../modules/nf-core/hamronization/fargene/main' +include { HAMRONIZATION_SUMMARIZE } from '../../modules/nf-core/hamronization/summarize/main' +include { TABIX_BGZIP as ARG_TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' +include { MERGE_TAXONOMY_HAMRONIZATION } from '../../modules/local/merge_taxonomy_hamronization' workflow ARG { take: - contigs // tuple val(meta), path(contigs) + contigs // tuple val(meta), path(contigs) annotations // output from prokka tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) @@ -148,9 +149,18 @@ workflow ARG { ch_versions = ch_versions.mix(HAMRONIZATION_SUMMARIZE.out.versions) // MERGE_TAXONOMY - ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() - MERGE_TAXONOMY_HAMRONIZATION(HAMRONIZATION_SUMMARIZE.out.tsv, ch_mmseqs_taxonomy_list) - ch_versions = ch_versions.mix(MERGE_TAXONOMY_HAMRONIZATION.out.versions) + if ( params.run_taxonomic_classification ) { + + ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + MERGE_TAXONOMY_HAMRONIZATION(HAMRONIZATION_SUMMARIZE.out.tsv, ch_mmseqs_taxonomy_list) + ch_versions = ch_versions.mix(MERGE_TAXONOMY_HAMRONIZATION.out.versions) + + ch_tabix_input 
= Channel.of(['id':'hamronization_combined_report']) + .combine(MERGE_TAXONOMY_HAMRONIZATION.out.tsv) + + ARG_TABIX_BGZIP(ch_tabix_input) + ch_versions = ch_versions.mix(ARG_TABIX_BGZIP.out.versions) + } emit: versions = ch_versions diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index c0d9a164..0420e09c 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -12,6 +12,7 @@ include { HMMER_HMMSEARCH as BGC_HMMER_HMMSEARCH } from '../../modules/nf-core include { DEEPBGC_DOWNLOAD } from '../../modules/nf-core/deepbgc/download/main' include { DEEPBGC_PIPELINE } from '../../modules/nf-core/deepbgc/pipeline/main' include { COMBGC } from '../../modules/local/combgc' +include { TABIX_BGZIP as BGC_TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' include { MERGE_TAXONOMY_COMBGC } from '../../modules/local/merge_taxonomy_combgc' workflow BGC { @@ -187,12 +188,26 @@ workflow BGC { ch_versions = ch_versions.mix(COMBGC.out.versions) // COMBGC concatenation - ch_combgc_summaries = COMBGC.out.tsv.map{ it[1] }.collectFile(name: 'combgc_complete_summary.tsv', storeDir: "${params.outdir}/reports/combgc", keepHeader:true) + if ( !params.run_taxonomic_classification ) { + ch_combgc_summaries = COMBGC.out.tsv.map{ it[1] }.collectFile(name: 'combgc_complete_summary.tsv', storeDir: "${params.outdir}/reports/combgc", keepHeader:true) + } else { + ch_combgc_summaries = COMBGC.out.tsv.map{ it[1] }.collectFile(name: 'combgc_complete_summary.tsv', keepHeader:true) + } // MERGE_TAXONOMY - ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() - MERGE_TAXONOMY_COMBGC(ch_combgc_summaries, ch_mmseqs_taxonomy_list) - ch_versions = ch_versions.mix(MERGE_TAXONOMY_COMBGC.out.versions) + if ( params.run_taxonomic_classification ) { + + ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + MERGE_TAXONOMY_COMBGC(ch_combgc_summaries, ch_mmseqs_taxonomy_list) + ch_versions = ch_versions.mix(MERGE_TAXONOMY_COMBGC.out.versions) + + ch_tabix_input = 
Channel.of(['id':'combgc_complete_summary_taxonomy']) + .combine(MERGE_TAXONOMY_COMBGC.out.tsv) + + BGC_TABIX_BGZIP(ch_tabix_input) + ch_versions = ch_versions.mix(BGC_TABIX_BGZIP.out.versions) + + } emit: versions = ch_versions diff --git a/subworkflows/local/taxa.nf b/subworkflows/local/taxa.nf new file mode 100644 index 00000000..5a5f94c2 --- /dev/null +++ b/subworkflows/local/taxa.nf @@ -0,0 +1,55 @@ +/* + TAXONOMIC CLASSIFICATION +*/ + +include { MMSEQS_CREATEDB } from '../../modules/nf-core/mmseqs/createdb/main' +include { MMSEQS_DATABASES } from '../../modules/nf-core/mmseqs/databases/main' +include { MMSEQS_TAXONOMY } from '../../modules/nf-core/mmseqs/taxonomy/main' +include { MMSEQS_CREATETSV } from '../../modules/nf-core/mmseqs/createtsv/main' + +workflow TAXA { + take: + contigs // tuple val(meta), path(contigs) + + main: + ch_versions = Channel.empty() + ch_mmseqs_db = Channel.empty() + ch_taxonomy_querydb = Channel.empty() + ch_taxonomy_querydb_taxdb = Channel.empty() + ch_taxonomy_tsv = Channel.empty() + + if ( params.taxonomic_classification_tool == 'mmseqs2') { + + // Download the ref db if not supplied by user + // MMSEQS_DATABASE + if ( params.mmseqs_databases_localpath != null ) { + ch_mmseqs_db = Channel + .fromPath( params.mmseqs_databases_localpath ) + .first() + } else { + MMSEQS_DATABASES ( params.mmseqs_databases_id ) + ch_versions = ch_versions.mix( MMSEQS_DATABASES.out.versions ) + ch_mmseqs_db = ( MMSEQS_DATABASES.out.database ) + } + + // Create db for query contigs, assign taxonomy and convert to table format + // MMSEQS_CREATEDB + MMSEQS_CREATEDB ( contigs ) + ch_versions = ch_versions.mix(MMSEQS_CREATEDB.out.versions) + ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db + + // MMSEQS_TAXONOMY + MMSEQS_TAXONOMY ( ch_taxonomy_querydb, ch_mmseqs_db ) + ch_versions = ch_versions.mix(MMSEQS_TAXONOMY.out.versions) + ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db_taxonomy + + // MMSEQS_CREATETSV + MMSEQS_CREATETSV ( 
ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb ) + ch_versions = ch_versions.mix(MMSEQS_CREATETSV.out.versions) + ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv + } + + emit: + versions = ch_versions + sample_taxonomy = ch_taxonomy_tsv //channel: [ val(meta), tsv ] + } diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 6b0cf853..538a57c0 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -29,9 +29,10 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { AMP } from '../subworkflows/local/amp' -include { ARG } from '../subworkflows/local/arg' -include { BGC } from '../subworkflows/local/bgc' +include { AMP } from '../subworkflows/local/amp' +include { ARG } from '../subworkflows/local/arg' +include { BGC } from '../subworkflows/local/bgc' +include { TAXA } from '../subworkflows/local/taxa' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -57,10 +58,6 @@ include { PRODIGAL as PRODIGAL_GBK } from '../modules/nf-core/prodigal/mai include { PYRODIGAL } from '../modules/nf-core/pyrodigal/main' include { BAKTA_BAKTADBDOWNLOAD } from '../modules/nf-core/bakta/baktadbdownload/main' include { BAKTA_BAKTA } from '../modules/nf-core/bakta/bakta/main' -include { MMSEQS_CREATEDB } from '../modules/nf-core/mmseqs/createdb/main' -include { MMSEQS_DATABASES } from '../modules/nf-core/mmseqs/databases/main' -include { MMSEQS_TAXONOMY } from '../modules/nf-core/mmseqs/taxonomy/main' -include { MMSEQS_CREATETSV } from '../modules/nf-core/mmseqs/createtsv/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -111,42 +108,21 @@ workflow FUNCSCAN { /* TAXONOMIC CLASSIFICATION */ - // The final subworkflow reports need taxonomic classification - // This can be either on NT or AA level depending on annotation - // NOTE: (AA tax. 
classification will be added only when its PR is merged - NOW - only on NT) - if ( params.taxonomy_mmseqs_classification_off == false ) { - - // Download the ref db if not supplied by user - if ( params.taxonomy_mmseqs_databases_localpath != null ) { - ch_mmseqs_db = Channel - .fromPath( params.taxonomy_mmseqs_databases_localpath ) - .first() - } else { - MMSEQS_DATABASES ( params.taxonomy_mmseqs_databases_id ) - ch_versions = ch_versions.mix( MMSEQS_DATABASES.out.versions ) - ch_mmseqs_db = ( MMSEQS_DATABASES.out.database ) - } - - // Create db for query contigs, assign taxonomy and convert to table format - MMSEQS_CREATEDB ( ch_prepped_input ) - ch_versions = ch_versions.mix(MMSEQS_CREATEDB.out.versions) - ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db - MMSEQS_TAXONOMY ( ch_taxonomy_querydb, ch_mmseqs_db ) - ch_versions = ch_versions.mix(MMSEQS_TAXONOMY.out.versions) - ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db_taxonomy - // MMSEQS_CREATETSV ( ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb ) - MMSEQS_CREATETSV ( ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb ) - ch_versions = ch_versions.mix(MMSEQS_CREATETSV.out.versions) - ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv - - } else { + // The final subworkflow reports need taxonomic classification. + // This can be either on NT or AA level depending on annotation. + // TODO: Only NT at the moment. AA tax. classification will be added only when its PR is merged. 
+ if ( params.run_taxonomic_classification ) { + TAXA ( ch_prepped_input ) + ch_versions = ch_versions.mix(TAXA.out.versions) + ch_taxonomy_tsv = TAXA.out.sample_taxonomy - ch_mmseqs_db = Channel.empty() - ch_taxonomy_querydb = Channel.empty() - ch_taxonomy_querydb_taxdb = Channel.empty() - ch_taxonomy_tsv = Channel.empty() + } else { + ch_mmseqs_db = Channel.empty() + ch_taxonomy_querydb = Channel.empty() + ch_taxonomy_querydb_taxdb = Channel.empty() + ch_taxonomy_tsv = Channel.empty() } /* @@ -227,7 +203,7 @@ workflow FUNCSCAN { /* AMPs */ - if ( params.run_amp_screening ) { + if ( params.run_amp_screening && !params.run_taxonomic_classification ) { AMP ( ch_prepped_input, ch_annotation_faa @@ -235,13 +211,26 @@ workflow FUNCSCAN { meta, file -> if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}") !file.isEmpty() + }, ch_taxonomy_tsv + ) + ch_versions = ch_versions.mix(AMP.out.versions) + } else if ( params.run_amp_screening && params.run_taxonomic_classification ) { + AMP ( + ch_prepped_input, + ch_annotation_faa + .filter { + meta, file -> + if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}") + !file.isEmpty() + }, + ch_taxonomy_tsv .filter { meta, file -> if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. 
Taxonomy merging will not be executed: ${meta.id}") !file.isEmpty() - } + } ) ch_versions = ch_versions.mix(AMP.out.versions) } @@ -249,17 +238,38 @@ workflow FUNCSCAN { /* ARGs */ - if ( params.run_arg_screening ) { + if ( params.run_arg_screening && !params.run_taxonomic_classification ) { if (params.arg_skip_deeparg) { - ARG ( ch_prepped_input, + ARG ( + ch_prepped_input, + [], + ch_taxonomy_tsv + ) + } else { + ARG ( + ch_prepped_input, + ch_annotation_faa + .filter { + meta, file -> + if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}") + !file.isEmpty() + }, + ch_taxonomy_tsv + ) + } + ch_versions = ch_versions.mix(ARG.out.versions) + } else if ( params.run_arg_screening && params.run_taxonomic_classification ) { + if (params.arg_skip_deeparg) { + ARG ( + ch_prepped_input, [], ch_taxonomy_tsv .filter { meta, file -> if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}") !file.isEmpty() - } - ) + } + ) } else { ARG ( ch_prepped_input, @@ -283,7 +293,7 @@ workflow FUNCSCAN { /* BGCs */ - if ( params.run_bgc_screening ) { + if ( params.run_bgc_screening && !params.run_taxonomic_classification ) { BGC ( ch_prepped_input, ch_annotation_gff @@ -305,7 +315,31 @@ workflow FUNCSCAN { !file.isEmpty() }, ch_taxonomy_tsv + ) + ch_versions = ch_versions.mix(BGC.out.versions) + } else if ( params.run_bgc_screening && params.run_taxonomic_classification ) { + BGC ( + ch_prepped_input, + ch_annotation_gff + .filter { + meta, file -> + if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GFF file. 
AMP screening tools requiring this file will not be executed: ${meta.id}") + !file.isEmpty() + }, + ch_annotation_faa + .filter { + meta, file -> + if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}") + !file.isEmpty() + }, + ch_annotation_gbk .filter { + meta, file -> + if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}") + !file.isEmpty() + }, + ch_taxonomy_tsv + .filter { meta, file -> if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}") !file.isEmpty() @@ -313,6 +347,7 @@ workflow FUNCSCAN { ) ch_versions = ch_versions.mix(BGC.out.versions) } + // // Collate and save software versions // From 265e1c65644b4de32a6e8886468606185c12f7ed Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Sat, 23 Mar 2024 12:56:10 +0100 Subject: [PATCH 29/39] lint taxa.nf --- subworkflows/local/taxa.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/taxa.nf b/subworkflows/local/taxa.nf index 5a5f94c2..d35e0e92 100644 --- a/subworkflows/local/taxa.nf +++ b/subworkflows/local/taxa.nf @@ -52,4 +52,4 @@ workflow TAXA { emit: versions = ch_versions sample_taxonomy = ch_taxonomy_tsv //channel: [ val(meta), tsv ] - } +} From 2bd8d56e119ee4a1c8c6a9a8c4b22f24d3f706e7 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Sat, 23 Mar 2024 13:14:00 +0100 Subject: [PATCH 30/39] add test_taxonomy in nextflow.config --- nextflow.config | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/nextflow.config b/nextflow.config index c798b050..6ac9c0a9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -339,11 +339,12 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 
'conf/test.config' } - test_bgc { includeConfig 'conf/test_bgc.config' } - test_full { includeConfig 'conf/test_full.config' } - test_deeparg { includeConfig 'conf/test_deeparg.config' } - test_nothing { includeConfig 'conf/test_nothing.config' } + test { includeConfig 'conf/test.config' } + test_bgc { includeConfig 'conf/test_bgc.config' } + test_taxonomy { includeConfig 'conf/test_taxonomy.config' } + test_full { includeConfig 'conf/test_full.config' } + test_deeparg { includeConfig 'conf/test_deeparg.config' } + test_nothing { includeConfig 'conf/test_nothing.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From 19119c9b23ad87c70b47bc4a4c69b6bbb1698822 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Mon, 25 Mar 2024 13:49:43 +0100 Subject: [PATCH 31/39] add James suggestions --- conf/test_nothing.config | 2 +- docs/output.md | 12 ++++++++---- docs/usage.md | 18 +++++++++++++++--- subworkflows/local/amp.nf | 4 ++-- subworkflows/local/{taxa.nf => taxa_class.nf} | 2 +- workflows/funcscan.nf | 14 +++++++------- 6 files changed, 34 insertions(+), 18 deletions(-) rename subworkflows/local/{taxa.nf => taxa_class.nf} (98%) diff --git a/conf/test_nothing.config b/conf/test_nothing.config index 2509be4a..34fdd49a 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -18,7 +18,7 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '9.GB' + max_memory = '8.GB' max_time = '6.h' // Input data diff --git a/docs/output.md b/docs/output.md index 83d58ec2..527e5604 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,7 +10,7 @@ The output of nf-core/funcscan provides reports for each of the functional group As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. 
After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. Additionally, taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) are saved if the `taxonomy_mmseqs_save_intermedfiles` flag was set. -Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/downloads/` if the `--save_databases` flag was set. +Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/databases/` if the `--save_databases` flag was set. Furthermore, for reproducibility, versions of all software used in the run is presented in a [MultiQC](http://multiqc.info) report. @@ -106,14 +106,18 @@ Output Summaries: - `taxonomic_classification/mmseqs2_createtsv/` - `/`: - - `*.tsv`: tab seperated table containing the taxonomic lineage of every contig when available. 
-- `reports//_complete_summary_taxonomy.tsv.gz`: tab seperated table containing the concatenated results from the summary tables along with the taxonomic classification if the parameter `run_taxonomic_classification` is called. + - `*.tsv`: tab separated table containing the taxonomic lineage of every contig only when available in the database. When a contig cannot be classified according to the database, it is assigned in the 'lineage' column as 'no ranK | unclassified'. +- `reports//_complete_summary_taxonomy.tsv.gz`: tab separated table containing the concatenated results from the summary tables along with the taxonomic classification if the parameter `run_taxonomic_classification` is called. [MMseqs2](https://github.com/soedinglab/MMseqs2) classifies the taxonomic lineage of contigs based on the last common ancestor. The inferred taxonomic lineages are included in the final workflow summaries to annotate the potential source bacteria of the identified AMPs, ARGs, and/or BGCs. +### Annotation tools + [Pyrodigal](#pyrodigal), [Prodigal](#prodigal), [Prokka](#prokka), [Bakta](#bakta) +### Annotation tools + #### Prodigal
@@ -420,7 +424,7 @@ Output Summaries: Output files - `ampcombi/` - - `ampcombi_complete_summary.tsv`: tab seperated table containing the concatenated results from the ampcombi summary tables. This is the output given when the taxonomic classification is not activated, i.e., pipeline default. + - `ampcombi_complete_summary.tsv`: tab separated table containing the concatenated results from the ampcombi summary tables. This is the output given when the taxonomic classification is not activated, i.e., pipeline default. - `ampcombi_complete_summary_taxonomy.tsv.gz`: summarised output from all AMP workflow tools with taxonomic assignment in compressed tsv format. - `ampcombi.log`: a log file generated by ampcombi - `/*_ampcombi.csv`: summarised output in csv for each sample diff --git a/docs/usage.md b/docs/usage.md index 7f364405..eb204e89 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,7 +6,7 @@ ## Introduction -nf-core/funcscan is a pipeline for efficient and parallelised screening of long nucleotide sequences such as contigs for antimicrobial peptide genes, antimicrobial resistance genes, and biosynthetic gene clusters. It further identifies their taxonomic orgin. +nf-core/funcscan is a pipeline for efficient and parallelised screening of long nucleotide sequences such as contigs for antimicrobial peptide genes, antimicrobial resistance genes, and biosynthetic gene clusters. It can additionally identify the taxonomic origin of the sequences. ## Running the pipeline @@ -18,7 +18,7 @@ nextflow run nf-core/funcscan --input samplesheet.csv --outdir -profile This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. 
-To run any of the three screening workflows (AMP, ARG, and/or BGC) or taxonomic classification (currently done with [MMseqs2](https://github.com/soedinglab/MMseqs2)), switch them on by adding the respective flag(s) to the command: +To run any of the three screening workflows (AMP, ARG, and/or BGC) or taxonomic classification, switch them on by adding the respective flag(s) to the command: - `--run_amp_screening` - `--run_arg_screening` @@ -69,10 +69,22 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p > ⚠️ We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools if none of the contigs in a FASTA file reach certain thresholds. Check parameter documentation for relevant minimum contig parameters. -## Notes on screening tools +## Notes on screening tools and taxonomic classification The implementation of some tools in the pipeline may have some particular behaviours that you should be aware of before you run the pipeline. +### MMseqs2 + +MMseqs2 is currently the only taxonomic classification tool used in the pipeline to assign a taxonomic lineage to the input contigs. The database used to assign the taxonomic lineage can either be: + - a custom based database created by the user using `mmseqs createdb` externally and beforehand. If this flag is assigned, this database takes precedence over the default database in ` mmseqs_databases_id`. + ``` + mmseqs_databases_localpath 'path/to/mmsesqs_custom_database/dir' + ``` + - an MMseqs2 ready database. These databases were compiled by the developers of MMseqs2 and can be called using their labels. All available options can be found [here](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases). Only use those databases that have taxonomy files available (i.e., Taxonomy == Yes). By default mmseqs2 in the pipeline uses 'Kalamari' and runs an aminoacid based alignment. 
+ ``` + mmseqs_databases_id 'Kalamari' + ``` + ### antiSMASH antiSMASH has a minimum contig parameter, in which only contigs of a certain length (or longer) will be screened. In cases where no hits are found in these, the tool ends successfully without hits. However if no contigs in an input file reach that minimum threshold, the tool will end with a 'failure' code, and cause the pipeline to crash. diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index a770d459..04b6f88d 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -9,7 +9,7 @@ include { AMPIR } from '.. include { DRAMP_DOWNLOAD } from '../../modules/local/dramp_download' include { AMPCOMBI } from '../../modules/nf-core/ampcombi/main' include { GUNZIP as GUNZIP_MACREL_PRED ; GUNZIP as GUNZIP_MACREL_ORFS } from '../../modules/nf-core/gunzip/main' -include { TABIX_BGZIP as AMP_TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' +include { TABIX_BGZIP as AMP_TABIX_BGZIP } from '../../modules/nf-core/tabix/bgzip/main' include { MERGE_TAXONOMY_AMPCOMBI } from '../../modules/local/merge_taxonomy_ampcombi' workflow AMP { @@ -110,7 +110,7 @@ workflow AMP { if ( !params.run_taxonomic_classification ) { ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile(name: 'ampcombi_complete_summary.tsv', storeDir: "${params.outdir}/reports/ampcombi",keepHeader:true) } else { - ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile(name: 'ampcombi_complete_summary.tsv',keepHeader:true) + ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile(name: 'ampcombi_complete_summary.tsv', keepHeader:true) } // MERGE_TAXONOMY diff --git a/subworkflows/local/taxa.nf b/subworkflows/local/taxa_class.nf similarity index 98% rename from subworkflows/local/taxa.nf rename to subworkflows/local/taxa_class.nf index d35e0e92..86385668 100644 --- a/subworkflows/local/taxa.nf +++ b/subworkflows/local/taxa_class.nf @@ -7,7 +7,7 @@ include { MMSEQS_DATABASES } from 
'../../modules/nf-core/mmseqs/databases/main' include { MMSEQS_TAXONOMY } from '../../modules/nf-core/mmseqs/taxonomy/main' include { MMSEQS_CREATETSV } from '../../modules/nf-core/mmseqs/createtsv/main' -workflow TAXA { +workflow TAXA_CLASS { take: contigs // tuple val(meta), path(contigs) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 538a57c0..089b15ee 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -29,10 +29,10 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { AMP } from '../subworkflows/local/amp' -include { ARG } from '../subworkflows/local/arg' -include { BGC } from '../subworkflows/local/bgc' -include { TAXA } from '../subworkflows/local/taxa' +include { AMP } from '../subworkflows/local/amp' +include { ARG } from '../subworkflows/local/arg' +include { BGC } from '../subworkflows/local/bgc' +include { TAXA_CLASS } from '../subworkflows/local/taxa_class' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -113,9 +113,9 @@ workflow FUNCSCAN { // This can be either on NT or AA level depending on annotation. // TODO: Only NT at the moment. AA tax. classification will be added only when its PR is merged. 
if ( params.run_taxonomic_classification ) { - TAXA ( ch_prepped_input ) - ch_versions = ch_versions.mix(TAXA.out.versions) - ch_taxonomy_tsv = TAXA.out.sample_taxonomy + TAXA_CLASS ( ch_prepped_input ) + ch_versions = ch_versions.mix(TAXA_CLASS.out.versions) + ch_taxonomy_tsv = TAXA_CLASS.out.sample_taxonomy } else { From d0152521c30901e98cf3a3cfb87c765396018698 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Mon, 25 Mar 2024 13:54:55 +0100 Subject: [PATCH 32/39] prettier run --- docs/usage.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index eb204e89..0c4884c0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -76,14 +76,18 @@ The implementation of some tools in the pipeline may have some particular behavi ### MMseqs2 MMseqs2 is currently the only taxonomic classification tool used in the pipeline to assign a taxonomic lineage to the input contigs. The database used to assign the taxonomic lineage can either be: - - a custom based database created by the user using `mmseqs createdb` externally and beforehand. If this flag is assigned, this database takes precedence over the default database in ` mmseqs_databases_id`. - ``` - mmseqs_databases_localpath 'path/to/mmsesqs_custom_database/dir' - ``` - - an MMseqs2 ready database. These databases were compiled by the developers of MMseqs2 and can be called using their labels. All available options can be found [here](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases). Only use those databases that have taxonomy files available (i.e., Taxonomy == Yes). By default mmseqs2 in the pipeline uses 'Kalamari' and runs an aminoacid based alignment. - ``` - mmseqs_databases_id 'Kalamari' - ``` + +- a custom based database created by the user using `mmseqs createdb` externally and beforehand. If this flag is assigned, this database takes precedence over the default database in ` mmseqs_databases_id`. 
+ +``` +mmseqs_databases_localpath 'path/to/mmsesqs_custom_database/dir' +``` + +- an MMseqs2 ready database. These databases were compiled by the developers of MMseqs2 and can be called using their labels. All available options can be found [here](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases). Only use those databases that have taxonomy files available (i.e., Taxonomy == Yes). By default mmseqs2 in the pipeline uses 'Kalamari' and runs an aminoacid based alignment. + +``` +mmseqs_databases_id 'Kalamari' +``` ### antiSMASH From 117d7ebb0336ab62c7a929db066f1394018f6890 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Fri, 29 Mar 2024 02:02:23 +0100 Subject: [PATCH 33/39] add reviewers suggestions --- bin/comBGC.py | 3 + bin/merge_taxonomy.py | 3 +- conf/modules.config | 22 +- conf/test_taxonomy.config | 2 +- docs/output.md | 4 +- docs/usage.md | 4 +- modules/local/merge_taxonomy_ampcombi.nf | 4 +- modules/local/merge_taxonomy_combgc.nf | 4 +- modules/local/merge_taxonomy_hamronization.nf | 4 +- nextflow.config | 318 +++++++++--------- nextflow_schema.json | 26 +- subworkflows/local/amp.nf | 4 +- subworkflows/local/taxa_class.nf | 8 +- workflows/funcscan.nf | 14 +- 14 files changed, 212 insertions(+), 208 deletions(-) diff --git a/bin/comBGC.py b/bin/comBGC.py index 12bcff01..3afd6aec 100755 --- a/bin/comBGC.py +++ b/bin/comBGC.py @@ -1,5 +1,8 @@ #!/usr/bin/env python3 +# Written by Jasmin Frangenberg and released under the MIT license. +# See below for full license text. + from Bio import SeqIO import pandas as pd import argparse diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py index f901ed91..14ea73a1 100755 --- a/bin/merge_taxonomy.py +++ b/bin/merge_taxonomy.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 -# Author: @darcy220606 +# Written by Anan Ibrahim and released under the MIT license. +# See git repository (https://github.com/Darcy220606/AMPcombi) for full license text. 
# Date: March 2024 # Version: 0.1.0 diff --git a/conf/modules.config b/conf/modules.config index 9503f3d0..014062d0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -52,7 +52,7 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = [ - params.mmseqs_databases_savetmp ? "" : "--remove-tmp-files" , + params.taxa_classification_mmseqs_databases_savetmp ? "" : "--remove-tmp-files" , ].join(' ').trim() } @@ -73,22 +73,22 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = [ - params.mmseqs_taxonomy_savetmp ? "" : "--remove-tmp-files", - "--search-type ${params.mmseqs_taxonomy_searchtype}", - "--lca-ranks ${params.mmseqs_taxonomy_lcaranks}", - "--tax-lineage ${params.mmseqs_taxonomy_taxlineage}", - "-s ${params.mmseqs_taxonomy_sensitivity}", - "--orf-filter-s ${params.mmseqs_taxonomy_orffilters}", - "--lca-mode ${params.mmseqs_taxonomy_lcamode}", - "--vote-mode ${params.mmseqs_taxonomy_votemode}", + params.taxa_classification_mmseqs_taxonomy_savetmp ? 
"" : "--remove-tmp-files", + "--search-type ${params.taxa_classification_mmseqs_taxonomy_searchtype}", + "--lca-ranks ${params.taxa_classification_mmseqs_taxonomy_lcaranks}", + "--tax-lineage ${params.taxa_classification_mmseqs_taxonomy_taxlineage}", + "-s ${params.taxa_classification_mmseqs_taxonomy_sensitivity}", + "--orf-filter-s ${params.taxa_classification_mmseqs_taxonomy_orffilters}", + "--lca-mode ${params.taxa_classification_mmseqs_taxonomy_lcamode}", + "--vote-mode ${params.taxa_classification_mmseqs_taxonomy_votemode}", ].join(' ').trim() } withName: MMSEQS_CREATETSV { publishDir = [ - path: { "${params.outdir}/taxonomic_classification/mmseqs_createtsv/${meta.id}/" }, + path: { "${params.outdir}/taxa_classification/mmseqs_createtsv/${meta.id}/" }, mode: params.publish_dir_mode, - enabled: params.run_taxonomic_classification, + enabled: params.run_taxa_classification, pattern: "*.tsv", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] diff --git a/conf/test_taxonomy.config b/conf/test_taxonomy.config index d8743664..a168f0c4 100644 --- a/conf/test_taxonomy.config +++ b/conf/test_taxonomy.config @@ -24,7 +24,7 @@ params { bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' - run_taxonomic_classification = true + run_taxa_classification = true annotation_tool = 'prodigal' run_arg_screening = true diff --git a/docs/output.md b/docs/output.md index 527e5604..c23c6169 100644 --- a/docs/output.md +++ b/docs/output.md @@ -8,7 +8,7 @@ The output of nf-core/funcscan provides reports for each of the functional group - antimicrobial peptides (tools: [Macrel](https://github.com/BigDataBiology/macrel), [AMPlify](https://github.com/bcgsc/AMPlify), [ampir](https://ampir.marine-omics.net), [hmmsearch](http://hmmer.org) – 
summarised by [AMPcombi](https://github.com/Darcy220606/AMPcombi)) - biosynthetic gene clusters (tools: [antiSMASH](https://docs.antismash.secondarymetabolites.org), [DeepBGC](https://github.com/Merck/deepbgc), [GECCO](https://gecco.embl.de), [hmmsearch](http://hmmer.org) – summarised by [comBGC](#combgc)) -As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. Additionally, taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) are saved if the `taxonomy_mmseqs_save_intermedfiles` flag was set. +As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. 
Additionally, taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) are saved if the `taxa_classification_mmseqs_databases_savetmp` and `taxa_classification_mmseqs_taxonomy_savetmp` flags are set. Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/databases/` if the `--save_databases` flag was set. @@ -107,7 +107,7 @@ Output Summaries: - `taxonomic_classification/mmseqs2_createtsv/` - `/`: - `*.tsv`: tab separated table containing the taxonomic lineage of every contig only when available in the database. When a contig cannot be classified according to the database, it is assigned in the 'lineage' column as 'no ranK | unclassified'. -- `reports//_complete_summary_taxonomy.tsv.gz`: tab separated table containing the concatenated results from the summary tables along with the taxonomic classification if the parameter `run_taxonomic_classification` is called. +- `reports//_complete_summary_taxonomy.tsv.gz`: tab separated table containing the concatenated results from the summary tables along with the taxonomic classification if the parameter `run_taxa_classification` is called.
[MMseqs2](https://github.com/soedinglab/MMseqs2) classifies the taxonomic lineage of contigs based on the last common ancestor. The inferred taxonomic lineages are included in the final workflow summaries to annotate the potential source bacteria of the identified AMPs, ARGs, and/or BGCs. diff --git a/docs/usage.md b/docs/usage.md index 0c4884c0..98ed2b5f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -80,13 +80,13 @@ MMseqs2 is currently the only taxonomic classification tool used in the pipeline - a custom based database created by the user using `mmseqs createdb` externally and beforehand. If this flag is assigned, this database takes precedence over the default database in ` mmseqs_databases_id`. ``` -mmseqs_databases_localpath 'path/to/mmsesqs_custom_database/dir' +taxa_classification_mmseqs_databases_localpath 'path/to/mmsesqs_custom_database/dir' ``` - an MMseqs2 ready database. These databases were compiled by the developers of MMseqs2 and can be called using their labels. All available options can be found [here](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases). Only use those databases that have taxonomy files available (i.e., Taxonomy == Yes). By default mmseqs2 in the pipeline uses 'Kalamari' and runs an aminoacid based alignment. 
``` -mmseqs_databases_id 'Kalamari' +taxa_classification_mmseqs_databases_id 'Kalamari' ``` ### antiSMASH diff --git a/modules/local/merge_taxonomy_ampcombi.nf b/modules/local/merge_taxonomy_ampcombi.nf index c9f3e30d..26e38343 100644 --- a/modules/local/merge_taxonomy_ampcombi.nf +++ b/modules/local/merge_taxonomy_ampcombi.nf @@ -11,8 +11,8 @@ process MERGE_TAXONOMY_AMPCOMBI { path(taxa_list) output: - path("ampcombi_complete_summary_taxonomy.tsv") , emit: tsv - path "versions.yml" , emit: versions + path "ampcombi_complete_summary_taxonomy.tsv" , emit: tsv + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/merge_taxonomy_combgc.nf b/modules/local/merge_taxonomy_combgc.nf index 95043310..075668f2 100644 --- a/modules/local/merge_taxonomy_combgc.nf +++ b/modules/local/merge_taxonomy_combgc.nf @@ -11,8 +11,8 @@ process MERGE_TAXONOMY_COMBGC { path(taxa_list) output: - path("combgc_complete_summary_taxonomy.tsv") , emit: tsv - path "versions.yml" , emit: versions + path "combgc_complete_summary_taxonomy.tsv" , emit: tsv + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/merge_taxonomy_hamronization.nf b/modules/local/merge_taxonomy_hamronization.nf index 9c50bc12..14b85ff2 100644 --- a/modules/local/merge_taxonomy_hamronization.nf +++ b/modules/local/merge_taxonomy_hamronization.nf @@ -11,8 +11,8 @@ process MERGE_TAXONOMY_HAMRONIZATION { path(taxa_list) output: - path("hamronization_complete_summary_taxonomy.tsv") , emit: tsv - path "versions.yml" , emit: versions + path "hamronization_complete_summary_taxonomy.tsv" , emit: tsv + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/nextflow.config b/nextflow.config index 6ac9c0a9..ccfd2bcd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,182 +10,182 @@ params { // Input options - input = null + input = null // Taxonomy classification options - 
run_taxonomic_classification = false - taxonomic_classification_tool = 'mmseqs2' - - mmseqs_databases_localpath = null - mmseqs_databases_id = 'Kalamari' - mmseqs_databases_savetmp = false - - mmseqs_taxonomy_savetmp = false - mmseqs_taxonomy_searchtype = 2 - mmseqs_taxonomy_lcaranks = 'kingdom,phylum,class,order,family,genus,species' - mmseqs_taxonomy_taxlineage = 1 - mmseqs_taxonomy_sensitivity = '5.0' - mmseqs_taxonomy_orffilters = '2.0' - mmseqs_taxonomy_lcamode = 3 - mmseqs_taxonomy_votemode = 1 + run_taxa_classification = false + taxa_classification_tool = 'mmseqs2' + + taxa_classification_mmseqs_databases_localpath = null + taxa_classification_mmseqs_databases_id = 'Kalamari' + taxa_classification_mmseqs_databases_savetmp = false + + taxa_classification_mmseqs_taxonomy_savetmp = false + taxa_classification_mmseqs_taxonomy_searchtype = 2 + taxa_classification_mmseqs_taxonomy_lcaranks = 'kingdom,phylum,class,order,family,genus,species' + taxa_classification_mmseqs_taxonomy_taxlineage = 1 + taxa_classification_mmseqs_taxonomy_sensitivity = '5.0' + taxa_classification_mmseqs_taxonomy_orffilters = '2.0' + taxa_classification_mmseqs_taxonomy_lcamode = 3 + taxa_classification_mmseqs_taxonomy_votemode = 1 // Annotation options - annotation_tool = 'pyrodigal' - save_annotations = false - - annotation_prodigal_singlemode = false - annotation_prodigal_closed = false - annotation_prodigal_transtable = 11 - annotation_prodigal_forcenonsd = false - - annotation_pyrodigal_singlemode = false - annotation_pyrodigal_closed = false - annotation_pyrodigal_transtable = 11 - annotation_pyrodigal_forcenonsd = false - - annotation_bakta_db_localpath = null - annotation_bakta_db_downloadtype = 'full' - annotation_bakta_singlemode = false - annotation_bakta_mincontiglen = 1 - annotation_bakta_translationtable = 11 - annotation_bakta_gram = '?' 
- annotation_bakta_complete = false - annotation_bakta_renamecontigheaders = false - annotation_bakta_compliant = false - annotation_bakta_trna = false - annotation_bakta_tmrna = false - annotation_bakta_rrna = false - annotation_bakta_ncrna = false - annotation_bakta_ncrnaregion = false - annotation_bakta_crispr = false - annotation_bakta_skipcds = false - annotation_bakta_pseudo = false - annotation_bakta_skipsorf = false - annotation_bakta_gap = false - annotation_bakta_ori = false - annotation_bakta_activate_plot = false - - annotation_prokka_singlemode = false - annotation_prokka_rawproduct = false - annotation_prokka_kingdom = 'Bacteria' - annotation_prokka_gcode = 11 - annotation_prokka_cdsrnaolap = false - annotation_prokka_rnammer = false - annotation_prokka_mincontiglen = 1 - annotation_prokka_evalue = 0.000001 - annotation_prokka_coverage = 80 - annotation_prokka_compliant = true - annotation_prokka_addgenes = false - annotation_prokka_retaincontigheaders = false + annotation_tool = 'pyrodigal' + save_annotations = false + + annotation_prodigal_singlemode = false + annotation_prodigal_closed = false + annotation_prodigal_transtable = 11 + annotation_prodigal_forcenonsd = false + + annotation_pyrodigal_singlemode = false + annotation_pyrodigal_closed = false + annotation_pyrodigal_transtable = 11 + annotation_pyrodigal_forcenonsd = false + + annotation_bakta_db_localpath = null + annotation_bakta_db_downloadtype = 'full' + annotation_bakta_singlemode = false + annotation_bakta_mincontiglen = 1 + annotation_bakta_translationtable = 11 + annotation_bakta_gram = '?' 
+ annotation_bakta_complete = false + annotation_bakta_renamecontigheaders = false + annotation_bakta_compliant = false + annotation_bakta_trna = false + annotation_bakta_tmrna = false + annotation_bakta_rrna = false + annotation_bakta_ncrna = false + annotation_bakta_ncrnaregion = false + annotation_bakta_crispr = false + annotation_bakta_skipcds = false + annotation_bakta_pseudo = false + annotation_bakta_skipsorf = false + annotation_bakta_gap = false + annotation_bakta_ori = false + annotation_bakta_activate_plot = false + + annotation_prokka_singlemode = false + annotation_prokka_rawproduct = false + annotation_prokka_kingdom = 'Bacteria' + annotation_prokka_gcode = 11 + annotation_prokka_cdsrnaolap = false + annotation_prokka_rnammer = false + annotation_prokka_mincontiglen = 1 + annotation_prokka_evalue = 0.000001 + annotation_prokka_coverage = 80 + annotation_prokka_compliant = true + annotation_prokka_addgenes = false + annotation_prokka_retaincontigheaders = false // Database downloading options - save_databases = false + save_databases = false // AMP options - run_amp_screening = false + run_amp_screening = false - amp_skip_amplify = false + amp_skip_amplify = false - amp_skip_macrel = false + amp_skip_macrel = false - amp_skip_ampir = false - amp_ampir_model = 'precursor' - amp_ampir_minlength = 10 + amp_skip_ampir = false + amp_ampir_model = 'precursor' + amp_ampir_minlength = 10 - amp_skip_hmmsearch = false - amp_hmmsearch_models = null - amp_hmmsearch_savealignments = false - amp_hmmsearch_savetargets = false - amp_hmmsearch_savedomains = false + amp_skip_hmmsearch = false + amp_hmmsearch_models = null + amp_hmmsearch_savealignments = false + amp_hmmsearch_savetargets = false + amp_hmmsearch_savedomains = false - amp_ampcombi_db = null - amp_ampcombi_cutoff = 0 + amp_ampcombi_db = null + amp_ampcombi_cutoff = 0 // ARG options - run_arg_screening = false - - arg_skip_fargene = false - arg_fargene_hmmmodel = 
'class_a,class_b_1_2,class_b_3,class_c,class_d_1,class_d_2,qnr,tet_efflux,tet_rpg,tet_enzyme' - arg_fargene_savetmpfiles = false - arg_fargene_minorflength = 90 - arg_fargene_score = null - arg_fargene_translationformat = 'pearson' - arg_fargene_orffinder = false - - arg_skip_rgi = false - arg_rgi_savejson = false - arg_rgi_savetmpfiles = false - arg_rgi_alignmenttool = 'BLAST' - arg_rgi_includeloose = true - arg_rgi_excludenudge = true - arg_rgi_lowquality = false - arg_rgi_data = 'NA' - - arg_skip_amrfinderplus = false - arg_amrfinderplus_db = null - arg_amrfinderplus_identmin = -1 - arg_amrfinderplus_coveragemin = 0.5 - arg_amrfinderplus_translationtable = 11 - arg_amrfinderplus_plus = false - arg_amrfinderplus_name = false - - arg_skip_deeparg = false - arg_deeparg_data = null - arg_deeparg_data_version = 2 // Make sure to update on module version bump! - arg_deeparg_model = 'LS' - arg_deeparg_minprob = 0.8 - arg_deeparg_alignmentidentity = 50 - arg_deeparg_alignmentevalue = 1e-10 - arg_deeparg_alignmentoverlap = 0.8 - arg_deeparg_numalignmentsperentry = 1000 - - arg_skip_abricate = false - arg_abricate_db = 'ncbi' - arg_abricate_minid = 80 - arg_abricate_mincov = 80 - - arg_hamronization_summarizeformat = 'tsv' + run_arg_screening = false + + arg_skip_fargene = false + arg_fargene_hmmmodel = 'class_a,class_b_1_2,class_b_3,class_c,class_d_1,class_d_2,qnr,tet_efflux,tet_rpg,tet_enzyme' + arg_fargene_savetmpfiles = false + arg_fargene_minorflength = 90 + arg_fargene_score = null + arg_fargene_translationformat = 'pearson' + arg_fargene_orffinder = false + + arg_skip_rgi = false + arg_rgi_savejson = false + arg_rgi_savetmpfiles = false + arg_rgi_alignmenttool = 'BLAST' + arg_rgi_includeloose = true + arg_rgi_excludenudge = true + arg_rgi_lowquality = false + arg_rgi_data = 'NA' + + arg_skip_amrfinderplus = false + arg_amrfinderplus_db = null + arg_amrfinderplus_identmin = -1 + arg_amrfinderplus_coveragemin = 0.5 + arg_amrfinderplus_translationtable = 11 + 
arg_amrfinderplus_plus = false + arg_amrfinderplus_name = false + + arg_skip_deeparg = false + arg_deeparg_data = null + arg_deeparg_data_version = 2 // Make sure to update on module version bump! + arg_deeparg_model = 'LS' + arg_deeparg_minprob = 0.8 + arg_deeparg_alignmentidentity = 50 + arg_deeparg_alignmentevalue = 1e-10 + arg_deeparg_alignmentoverlap = 0.8 + arg_deeparg_numalignmentsperentry = 1000 + + arg_skip_abricate = false + arg_abricate_db = 'ncbi' + arg_abricate_minid = 80 + arg_abricate_mincov = 80 + + arg_hamronization_summarizeformat = 'tsv' // BGC options - run_bgc_screening = false - - bgc_skip_antismash = false - bgc_antismash_databases = null - bgc_antismash_installationdirectory = null - bgc_antismash_cbgeneral = false - bgc_antismash_cbknownclusters = false - bgc_antismash_cbsubclusters = false - bgc_antismash_smcogtrees = false - bgc_antismash_ccmibig = false - bgc_antismash_contigminlength = 1000 - bgc_antismash_hmmdetectionstrictness = 'relaxed' - bgc_antismash_taxon = 'bacteria' - bgc_antismash_sampleminlength = 1000 - - bgc_skip_deepbgc = false - bgc_deepbgc_database = null - bgc_deepbgc_score = 0.5 - bgc_deepbgc_prodigalsinglemode = false - bgc_deepbgc_mergemaxproteingap = 0 - bgc_deepbgc_mergemaxnuclgap = 0 - bgc_deepbgc_minnucl = 1 - bgc_deepbgc_minproteins = 1 - bgc_deepbgc_mindomains = 1 - bgc_deepbgc_minbiodomains = 0 - bgc_deepbgc_classifierscore = 0.5 - - bgc_skip_gecco = false - bgc_gecco_cds = 3 - bgc_gecco_threshold = 0.8 - bgc_gecco_pfilter = 0.000000001 - bgc_gecco_edgedistance = 0 - bgc_gecco_mask = false - - bgc_skip_hmmsearch = false - bgc_hmmsearch_models = null - bgc_hmmsearch_savealignments = false - bgc_hmmsearch_savetargets = false - bgc_hmmsearch_savedomains = false + run_bgc_screening = false + + bgc_skip_antismash = false + bgc_antismash_databases = null + bgc_antismash_installationdirectory = null + bgc_antismash_cbgeneral = false + bgc_antismash_cbknownclusters = false + bgc_antismash_cbsubclusters = false + 
bgc_antismash_smcogtrees = false + bgc_antismash_ccmibig = false + bgc_antismash_contigminlength = 1000 + bgc_antismash_hmmdetectionstrictness = 'relaxed' + bgc_antismash_taxon = 'bacteria' + bgc_antismash_sampleminlength = 1000 + + bgc_skip_deepbgc = false + bgc_deepbgc_database = null + bgc_deepbgc_score = 0.5 + bgc_deepbgc_prodigalsinglemode = false + bgc_deepbgc_mergemaxproteingap = 0 + bgc_deepbgc_mergemaxnuclgap = 0 + bgc_deepbgc_minnucl = 1 + bgc_deepbgc_minproteins = 1 + bgc_deepbgc_mindomains = 1 + bgc_deepbgc_minbiodomains = 0 + bgc_deepbgc_classifierscore = 0.5 + + bgc_skip_gecco = false + bgc_gecco_cds = 3 + bgc_gecco_threshold = 0.8 + bgc_gecco_pfilter = 0.000000001 + bgc_gecco_edgedistance = 0 + bgc_gecco_mask = false + + bgc_skip_hmmsearch = false + bgc_hmmsearch_models = null + bgc_hmmsearch_savealignments = false + bgc_hmmsearch_savetargets = false + bgc_hmmsearch_savedomains = false // MultiQC options multiqc_config = null diff --git a/nextflow_schema.json b/nextflow_schema.json index a0d58dec..275583f5 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -74,13 +74,13 @@ "description": "These options influence whether to activate the taxonomic classification of the input nucleotide sequences.", "default": "", "properties": { - "run_taxonomic_classification": { + "run_taxa_classification": { "type": "boolean", "description": "Activates the taxonomic classification of input mucleotide sequences.", "help_text": "This flag turns on the taxonomic classification of input nucleotide sequences. The taxonomic annotations should be turned on if the input metagenomes' bacterial sources are unknown, which can help identify the source of the AMP, BGC or ARG hit obtained for laboratory experiments. This flag should be turned off (which is by deafult) if the input nucleotide sequences represnet a single known genome or *nf-core/mag* was run beforhand. Turning on this flag relatively decreases the pipleine speed and requires >8GB RAM. 
Due to the size of the resulting table, the final complete summary is in a zipped format.", "fa_icon": "fas fa-ad" }, - "taxonomic_classification_tool": { + "taxa_classification_tool": { "type": "string", "default": "mmseqs2", "help_text": "This flag specifies which tool for taxonomic classification should be activated. At the moment only 'MMseqs2' is incorporated in the pipeline.", @@ -96,19 +96,19 @@ "description": "These parameters influence the database to be used in classifying the taxonomy.", "default": "", "properties": { - "mmseqs_databases_localpath": { + "taxa_classification_mmseqs_databases_localpath": { "description": "Specify a path to MMseqs2-formatted database.", "help_text": "Specify a path to a database that is prepared in MMseqs2 format as detailed in the [documentation](https://mmseqs.com/latest/userguide.pdf).", "fa_icon": "fab fa-stackpath" }, - "mmseqs_databases_id": { + "taxa_classification_mmseqs_databases_id": { "type": "string", "default": "Kalamari", "help_text": "Specify which MMseqs2-formatted database to use to classify the input contigs. This can be a nucleotide or amino acid database that includes taxonomic classifications. For example, both GTDB (an amico acid database) and SILVA (a nucleotide database) are supported by MMseqs2. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases \".", "description": "Specify the label of the database to be used.", "fa_icon": "fas fa-address-card" }, - "mmseqs_databases_savetmp": { + "taxa_classification_mmseqs_databases_savetmp": { "type": "boolean", "help_text": "This flag saves the temporary files from downloading the database and formatting them in the MMseqs2 format into the output folder. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). 
\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases: `--remove-tmp-files`\".", "description": "Specify whether the temporary files should be saved.", @@ -123,55 +123,55 @@ "description": "These parameters influence the taxonomic classification step.", "default": "", "properties": { - "mmseqs_taxonomy_savetmp": { + "taxa_classification_mmseqs_taxonomy_savetmp": { "type": "boolean", "help_text": "This flag saves the temporary files from creating the taxonomy database and the final `tsv` file into the output folder. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--remove-tmp-files`\".\n", "description": "Specify whether to save the temporary files.", "fa_icon": "fab fa-adversal" }, - "mmseqs_taxonomy_searchtype": { + "taxa_classification_mmseqs_taxonomy_searchtype": { "type": "integer", "default": 2, "help_text": "Specify the type of alignment to be carried out between the query database and the reference MMseqs2 database. This can be set to '0' for automatic detection, '1' for amino acid alignment, '2' for translating the inputs and running the alignment on the translated sequences, '3' nucleotide based alignment and '4' for the translated nucleotide sequences alignment. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--search-type`\".", "description": "Specify the alignment type between database and query.", "fa_icon": "fas fa-align-center" }, - "mmseqs_taxonomy_lcaranks": { + "taxa_classification_mmseqs_taxonomy_lcaranks": { "type": "string", "default": "kingdom,phylum,class,order,family,genus,species", "help_text": "Specify the taxonomic ranks to include in the taxonomic lineage column in the final `.tsv` file. For example, 'kingdom,phylum,class,order,family,genus,species'. 
More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-ranks`\".", "description": "Specify the taxonomic levels to display in the result table.", "fa_icon": "fas fa-stream" }, - "mmseqs_taxonomy_taxlineage": { + "taxa_classification_mmseqs_taxonomy_taxlineage": { "type": "integer", "default": 1, "help_text": "This flag specifies whether the taxonomic lineage should be included in the output `.tsv` file. The taxonomic lineage is obtained from the internal module of `mmseqs/taxonomy` that infers the last common ancestor to classify the taxonomy. A value of '0' writes no taxonomic lineage, a value of '1' adds a column with the full lineage names prefixed with abbreviation of the lineage level, e.g., k_Prokaryotes;p_Bacteroidetes;c_....;o_....;f_....;g_....;s_...., while a value of '2' adds a column with the full NCBI taxids lineage,e.g., 1324;2345;4546;5345. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--tax-lineage`\".", "description": "Specify whether to include or remove the taxonomic lineage.", "fa_icon": "fab fa-audible" }, - "mmseqs_taxonomy_sensitivity": { + "taxa_classification_mmseqs_taxonomy_sensitivity": { "type": "string", "default": "5.0", "help_text": "This flag specifies the speed and sensitivity of the taxonomic search. It stands for how many kmers should be produced during the preliminary seeding stage. A very fast search requires a low value e.g., '1.0' and a a very sensitive search requires e.g., '7.0'. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). 
\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--s`\".", "description": "Specify the speed and sensitivity for taxonomy assignment.", "fa_icon": "fas fa-history" }, - "mmseqs_taxonomy_orffilters": { + "taxa_classification_mmseqs_taxonomy_orffilters": { "type": "string", "default": "2.0", "help_text": "This flag specifies the sensitivity used for prefiltering the query ORF. Before the taxonomy-assigning step, MMseqs2 searches the predicted ORFs against the provided database. This value influences the speed with which the search is carried out. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--orf-filter-s`\".", "description": "Specify the ORF search sensitivity in the prefilter step.", "fa_icon": "fas fa-clock" }, - "mmseqs_taxonomy_lcamode": { + "taxa_classification_mmseqs_taxonomy_lcamode": { "type": "integer", "default": 3, "help_text": "This flag specifies the strategy used for assigning the last common ancestor (LCA). MMseqs2 assigns taxonomy based on an accelerated approximation of the 2bLCA protocol and uses the value of '3'. In this mode, the taxonomic assignment is based not only on usual alignment parameters but also considers the taxonomic classification of the LCA. When the value '4' is used the LCA is assigned based on all the equal scoring top hits. If the value '1' is used the LCA assignment is disregarded and the taxonomic assignment is based on usual alignment parameters like evalue and coverage. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). 
\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-mode`\".", "description": "Specify the mode to assign the taxonomy.", "fa_icon": "fas fa-broom" }, - "mmseqs_taxonomy_votemode": { + "taxa_classification_mmseqs_taxonomy_votemode": { "type": "integer", "default": 1, "help_text": "This flag assigns the mode value with which the weights are computed. The value of '0' stands for uniform weights of taxonomy assignments, the value of '1' uses the minus log E-value and '2' the actual score. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--vote-mode`\".", diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index 04b6f88d..ea5f1c6e 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -107,14 +107,14 @@ workflow AMP { ch_versions = ch_versions.mix(AMPCOMBI.out.versions) //AMPCOMBI concatenation - if ( !params.run_taxonomic_classification ) { + if ( !params.run_taxa_classification ) { ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile(name: 'ampcombi_complete_summary.tsv', storeDir: "${params.outdir}/reports/ampcombi",keepHeader:true) } else { ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile(name: 'ampcombi_complete_summary.tsv', keepHeader:true) } // MERGE_TAXONOMY - if ( params.run_taxonomic_classification ) { + if ( params.run_taxa_classification ) { ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() MERGE_TAXONOMY_AMPCOMBI(ch_ampcombi_summaries, ch_mmseqs_taxonomy_list) diff --git a/subworkflows/local/taxa_class.nf b/subworkflows/local/taxa_class.nf index 86385668..eb03836d 100644 --- a/subworkflows/local/taxa_class.nf +++ b/subworkflows/local/taxa_class.nf @@ -18,16 +18,16 @@ workflow TAXA_CLASS { ch_taxonomy_querydb_taxdb = Channel.empty() ch_taxonomy_tsv = Channel.empty() - if ( params.taxonomic_classification_tool == 'mmseqs2') { + if ( params.taxa_classification_tool == 'mmseqs2') 
{ // Download the ref db if not supplied by user // MMSEQS_DATABASE - if ( params.mmseqs_databases_localpath != null ) { + if ( params.taxa_classification_mmseqs_databases_localpath != null ) { ch_mmseqs_db = Channel - .fromPath( params.mmseqs_databases_localpath ) + .fromPath( params.taxa_classification_mmseqs_databases_localpath ) .first() } else { - MMSEQS_DATABASES ( params.mmseqs_databases_id ) + MMSEQS_DATABASES ( params.taxa_classification_mmseqs_databases_id ) ch_versions = ch_versions.mix( MMSEQS_DATABASES.out.versions ) ch_mmseqs_db = ( MMSEQS_DATABASES.out.database ) } diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 089b15ee..a5f448ff 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -112,7 +112,7 @@ workflow FUNCSCAN { // The final subworkflow reports need taxonomic classification. // This can be either on NT or AA level depending on annotation. // TODO: Only NT at the moment. AA tax. classification will be added only when its PR is merged. - if ( params.run_taxonomic_classification ) { + if ( params.run_taxa_classification ) { TAXA_CLASS ( ch_prepped_input ) ch_versions = ch_versions.mix(TAXA_CLASS.out.versions) ch_taxonomy_tsv = TAXA_CLASS.out.sample_taxonomy @@ -203,7 +203,7 @@ workflow FUNCSCAN { /* AMPs */ - if ( params.run_amp_screening && !params.run_taxonomic_classification ) { + if ( params.run_amp_screening && !params.run_taxa_classification ) { AMP ( ch_prepped_input, ch_annotation_faa @@ -216,7 +216,7 @@ workflow FUNCSCAN { ch_taxonomy_tsv ) ch_versions = ch_versions.mix(AMP.out.versions) - } else if ( params.run_amp_screening && params.run_taxonomic_classification ) { + } else if ( params.run_amp_screening && params.run_taxa_classification ) { AMP ( ch_prepped_input, ch_annotation_faa @@ -238,7 +238,7 @@ workflow FUNCSCAN { /* ARGs */ - if ( params.run_arg_screening && !params.run_taxonomic_classification ) { + if ( params.run_arg_screening && !params.run_taxa_classification ) { if (params.arg_skip_deeparg) 
{ ARG ( ch_prepped_input, @@ -258,7 +258,7 @@ workflow FUNCSCAN { ) } ch_versions = ch_versions.mix(ARG.out.versions) - } else if ( params.run_arg_screening && params.run_taxonomic_classification ) { + } else if ( params.run_arg_screening && params.run_taxa_classification ) { if (params.arg_skip_deeparg) { ARG ( ch_prepped_input, @@ -293,7 +293,7 @@ workflow FUNCSCAN { /* BGCs */ - if ( params.run_bgc_screening && !params.run_taxonomic_classification ) { + if ( params.run_bgc_screening && !params.run_taxa_classification ) { BGC ( ch_prepped_input, ch_annotation_gff @@ -317,7 +317,7 @@ workflow FUNCSCAN { ch_taxonomy_tsv ) ch_versions = ch_versions.mix(BGC.out.versions) - } else if ( params.run_bgc_screening && params.run_taxonomic_classification ) { + } else if ( params.run_bgc_screening && params.run_taxa_classification ) { BGC ( ch_prepped_input, ch_annotation_gff From 0c61e7f67bc2a8bfd44847c208f7bed89431ca35 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Fri, 29 Mar 2024 02:31:54 +0100 Subject: [PATCH 34/39] fix params in arg and bgc nf --- subworkflows/local/arg.nf | 2 +- subworkflows/local/bgc.nf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/arg.nf b/subworkflows/local/arg.nf index a8418fa3..f2724867 100644 --- a/subworkflows/local/arg.nf +++ b/subworkflows/local/arg.nf @@ -149,7 +149,7 @@ workflow ARG { ch_versions = ch_versions.mix(HAMRONIZATION_SUMMARIZE.out.versions) // MERGE_TAXONOMY - if ( params.run_taxonomic_classification ) { + if ( params.run_taxa_classification ) { ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() MERGE_TAXONOMY_HAMRONIZATION(HAMRONIZATION_SUMMARIZE.out.tsv, ch_mmseqs_taxonomy_list) diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 0420e09c..edbfada2 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -188,14 +188,14 @@ workflow BGC { ch_versions = ch_versions.mix(COMBGC.out.versions) // COMBGC concatenation - if ( 
!params.run_taxonomic_classification ) { + if ( !params.run_taxa_classification ) { ch_combgc_summaries = COMBGC.out.tsv.map{ it[1] }.collectFile(name: 'combgc_complete_summary.tsv', storeDir: "${params.outdir}/reports/combgc", keepHeader:true) } else { ch_combgc_summaries = COMBGC.out.tsv.map{ it[1] }.collectFile(name: 'combgc_complete_summary.tsv', keepHeader:true) } // MERGE_TAXONOMY - if ( params.run_taxonomic_classification ) { + if ( params.run_taxa_classification ) { ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() MERGE_TAXONOMY_COMBGC(ch_combgc_summaries, ch_mmseqs_taxonomy_list) From df631e68712aa691f304a2822a1289f17a30490b Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 2 Apr 2024 11:58:13 +0200 Subject: [PATCH 35/39] add last review suggestions --- docs/usage.md | 14 +++++++------- subworkflows/local/bgc.nf | 11 +++++------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 98ed2b5f..2193a3e1 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -79,15 +79,15 @@ MMseqs2 is currently the only taxonomic classification tool used in the pipeline - a custom based database created by the user using `mmseqs createdb` externally and beforehand. If this flag is assigned, this database takes precedence over the default database in ` mmseqs_databases_id`. -``` -taxa_classification_mmseqs_databases_localpath 'path/to/mmsesqs_custom_database/dir' -``` + ```bash + --taxa_classification_mmseqs_databases_localpath 'path/to/mmsesqs_custom_database/dir' + ``` -- an MMseqs2 ready database. These databases were compiled by the developers of MMseqs2 and can be called using their labels. All available options can be found [here](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases). Only use those databases that have taxonomy files available (i.e., Taxonomy == Yes). By default mmseqs2 in the pipeline uses 'Kalamari' and runs an aminoacid based alignment. +- an MMseqs2 ready database. 
These databases were compiled by the developers of MMseqs2 and can be called using their labels. All available options can be found [here](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases). Only use those databases that have taxonomy files available (i.e., Taxonomy == Yes). By default mmseqs2 in the pipeline uses '[Kalamari](https://github.com/lskatz/Kalamari)' and runs an aminoacid based alignment. -``` -taxa_classification_mmseqs_databases_id 'Kalamari' -``` + ```bash + --taxa_classification_mmseqs_databases_id 'Kalamari' + ``` ### antiSMASH diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 0d168fe2..3626c283 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -198,15 +198,14 @@ workflow BGC { if ( params.run_taxa_classification ) { ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() - MERGE_TAXONOMY_COMBGC(ch_combgc_summaries, ch_mmseqs_taxonomy_list) - ch_versions = ch_versions.mix(MERGE_TAXONOMY_COMBGC.out.versions) + MERGE_TAXONOMY_COMBGC( ch_combgc_summaries, ch_mmseqs_taxonomy_list ) + ch_versions = ch_versions.mix( MERGE_TAXONOMY_COMBGC.out.versions ) - ch_tabix_input = Channel.of(['id':'combgc_complete_summary_taxonomy']) + ch_tabix_input = Channel.of( [ 'id':'combgc_complete_summary_taxonomy' ] ) .combine(MERGE_TAXONOMY_COMBGC.out.tsv) - BGC_TABIX_BGZIP(ch_tabix_input) - ch_versions = ch_versions.mix(BGC_TABIX_BGZIP.out.versions) - + BGC_TABIX_BGZIP( ch_tabix_input ) + ch_versions = ch_versions.mix( BGC_TABIX_BGZIP.out.versions ) } emit: From 67255e9797c3f1313405ff7fbf047b044dbdea28 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 2 Apr 2024 11:59:36 +0200 Subject: [PATCH 36/39] update usage.md from James --- docs/usage.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 2193a3e1..ea19430e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -79,15 +79,15 @@ MMseqs2 is currently the only taxonomic classification tool used in 
the pipeline - a custom based database created by the user using `mmseqs createdb` externally and beforehand. If this flag is assigned, this database takes precedence over the default database in ` mmseqs_databases_id`. - ```bash - --taxa_classification_mmseqs_databases_localpath 'path/to/mmsesqs_custom_database/dir' - ``` + ```bash + --taxa_classification_mmseqs_databases_localpath 'path/to/mmsesqs_custom_database/dir' + ``` - an MMseqs2 ready database. These databases were compiled by the developers of MMseqs2 and can be called using their labels. All available options can be found [here](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases). Only use those databases that have taxonomy files available (i.e., Taxonomy == Yes). By default mmseqs2 in the pipeline uses '[Kalamari](https://github.com/lskatz/Kalamari)' and runs an aminoacid based alignment. - ```bash - --taxa_classification_mmseqs_databases_id 'Kalamari' - ``` + ```bash + --taxa_classification_mmseqs_databases_id 'Kalamari' + ``` ### antiSMASH From 3eb03f611861034d8cb05eb61dd7dcf344689493 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 2 Apr 2024 12:39:07 +0200 Subject: [PATCH 37/39] fix linting --- conf/modules.config | 2 +- docs/usage.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ba3487b6..6475bfaa 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -512,7 +512,7 @@ process { publishDir = [ path: { "${params.outdir}/reports/hamronization_summarize" }, mode: params.publish_dir_mode, - saveAs: { (params.run_taxonomic_classification == false) ? it : null } + saveAs: { (params.run_taxa_classification == false) ? 
it : null } ] } diff --git a/docs/usage.md b/docs/usage.md index ea19430e..27afb2ea 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -23,7 +23,7 @@ To run any of the three screening workflows (AMP, ARG, and/or BGC) or taxonomic - `--run_amp_screening` - `--run_arg_screening` - `--run_bgc_screening` -- `--run_taxonomic_classification` +- `--run_taxa_classification` When switched on, all tools of the given workflow will be run by default. If you don't need specific tools, you can explicitly skip them. For the taxonomic classification, MMseqs2 is currently the only tool implemented in the pipline. From f014be23bd6722c28a939c18f9c81a8c58cd5e6c Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 2 Apr 2024 13:22:46 +0200 Subject: [PATCH 38/39] update output.md --- docs/output.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index 0a4ce192..8aef8416 100644 --- a/docs/output.md +++ b/docs/output.md @@ -484,7 +484,7 @@ Output Summaries: - `hamronization_summarize/` one of the following: - `hamronization_combined_report.json`: summarised output in .json format - `hamronization_combined_report.tsv`: summarised output in .tsv format when the taxonomic classification is turned off (pipeline default). - - `hamronization_combined_report.tsv.gz`: summarised output in zipped format when the taxonomic classification is turned on by `run_taxonomic_classification`. + - `hamronization_combined_report.tsv.gz`: summarised output in zipped format when the taxonomic classification is turned on by `run_taxa_classification`. - `hamronization_combined_report.html`: interactive output in .html format @@ -541,7 +541,7 @@ Output Summaries: - `comBGC/` - `combgc_complete_summary.tsv`: summarised output from all BGC detection tools used in tsv format (all samples concatenated). This is the output given when the taxonomic classification is not activated, i.e., pipeline default. 
- - `combgc_complete_summary.tsv.gz`: summarised output in zipped format from all BGC detection tools used in tsv format (all samples concatenated) along with the taxonomic classification obtained when `run_taxonomic_classification` is activated. + - `combgc_complete_summary.tsv.gz`: summarised output in zipped format from all BGC detection tools used in tsv format (all samples concatenated) along with the taxonomic classification obtained when `run_taxa_classification` is activated. - `*/combgc_summary.tsv`: summarised output from all applied BGC detection tools in tsv format for each sample. From d974f15f009e79d33b8a512b0244cabbde3a0015 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Tue, 2 Apr 2024 16:28:11 +0200 Subject: [PATCH 39/39] add reviewers suggestions --- conf/modules.config | 8 ++++---- conf/test_taxonomy.config | 22 +++++++++++----------- docs/output.md | 14 +++++++------- nextflow_schema.json | 26 +++++++++++++------------- subworkflows/local/taxa_class.nf | 18 +++++++++--------- 5 files changed, 44 insertions(+), 44 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 6475bfaa..eb27c5a1 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -46,7 +46,7 @@ process { withName: MMSEQS_DATABASES { publishDir = [ - path: { "${params.outdir}/databases/mmseqs/" }, // dir==mmseqs_database/ + path: { "${params.outdir}/databases/mmseqs/" }, mode: params.publish_dir_mode, enabled: params.save_databases, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } @@ -520,7 +520,7 @@ process { publishDir = [ path: { "${params.outdir}/reports/hamronization_summarize" }, mode: params.publish_dir_mode, - saveAs: { _ -> null } //do not save the file + saveAs: { _ -> null } // do not save the file ] } @@ -547,7 +547,7 @@ process { publishDir = [ path: { "${params.outdir}/reports/ampcombi" }, mode: params.publish_dir_mode, - saveAs: { _ -> null } //do not save the file + saveAs: { _ -> null } // do not save the file ] } @@ -571,7 +571,7 @@ process { publishDir = [ path: { "${params.outdir}/reports/combgc" }, mode: params.publish_dir_mode, - saveAs: { _ -> null } //do not save the file + saveAs: { _ -> null } // do not save the file ] } diff --git a/conf/test_taxonomy.config b/conf/test_taxonomy.config index a168f0c4..ad477b3c 100644 --- a/conf/test_taxonomy.config +++ b/conf/test_taxonomy.config @@ -20,19 +20,19 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' - bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' - amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' + bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' + amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' - run_taxa_classification = true - annotation_tool = 'prodigal' + run_taxa_classification = true + annotation_tool = 'prodigal' - run_arg_screening = true - arg_skip_deeparg = true - arg_skip_amrfinderplus = true + run_arg_screening = true + arg_skip_deeparg = true + arg_skip_amrfinderplus = true - 
run_amp_screening = true + run_amp_screening = true - run_bgc_screening = true - bgc_skip_deepbgc = true + run_bgc_screening = true + bgc_skip_deepbgc = true } diff --git a/docs/output.md b/docs/output.md index 8aef8416..f20d1cd2 100644 --- a/docs/output.md +++ b/docs/output.md @@ -8,7 +8,7 @@ The output of nf-core/funcscan provides reports for each of the functional group - antimicrobial peptides (tools: [Macrel](https://github.com/BigDataBiology/macrel), [AMPlify](https://github.com/bcgsc/AMPlify), [ampir](https://ampir.marine-omics.net), [hmmsearch](http://hmmer.org) – summarised by [AMPcombi](https://github.com/Darcy220606/AMPcombi)) - biosynthetic gene clusters (tools: [antiSMASH](https://docs.antismash.secondarymetabolites.org), [DeepBGC](https://github.com/Merck/deepbgc), [GECCO](https://gecco.embl.de), [hmmsearch](http://hmmer.org) – summarised by [comBGC](#combgc)) -As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. Additionally, taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) are saved if the `taxa_classification_mmseqs_databases_savetmp` and `taxa_classification_mmseqs_taxonomy_savetmp` flags are set. 
+As a general workflow, we recommend to first look at the summary reports ([ARGs](#hamronization), [AMPs](#ampcombi), [BGCs](#combgc)), to get a general overview of what hits have been found across all the tools of each functional group. After which, you can explore the specific output directories of each tool to get more detailed information about each result. The tool-specific output directories also includes the output from the functional annotation steps of either [prokka](https://github.com/tseemann/prokka), [pyrodigal](https://github.com/althonos/pyrodigal), [prodigal](https://github.com/hyattpd/Prodigal), or [Bakta](https://github.com/oschwengers/bakta) if the `--save_annotations` flag was set. Additionally, taxonomic classifications from [MMseqs2](https://github.com/soedinglab/MMseqs2) are saved if the `--taxa_classification_mmseqs_databases_savetmp` and `--taxa_classification_mmseqs_taxonomy_savetmp` flags are set. Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://github.com/soedinglab/MMseqs2), [antiSMASH](https://docs.antismash.secondarymetabolites.org), [AMRFinderPlus](https://www.ncbi.nlm.nih.gov/pathogens/antimicrobial-resistance/AMRFinder), [Bakta](https://github.com/oschwengers/bakta), [DeepARG](https://bitbucket.org/gusphdproj/deeparg-ss/src/master), [RGI](https://github.com/arpcard/rgi), and/or [AMPcombi](https://github.com/Darcy220606/AMPcombi)) into the output directory `/databases/` if the `--save_databases` flag was set. @@ -106,8 +106,8 @@ Output Summaries: - `taxonomic_classification/mmseqs2_createtsv/` - `/`: - - `*.tsv`: tab separated table containing the taxonomic lineage of every contig only when available in the database. When a contig cannot be classified according to the database, it is assigned in the 'lineage' column as 'no ranK | unclassified'. 
-- `reports//_complete_summary_taxonomy.tsv.gz`: tab separated table containing the concatenated results from the summary tables along with the taxonomic classification if the parameter `run_taxa_classification` is called. + - `*.tsv`: tab-separated table containing the taxonomic lineage of every contig. When a contig cannot be classified according to the database, it is assigned in the 'lineage' column as 'no rank | unclassified'. +- `reports//_complete_summary_taxonomy.tsv.gz`: tab-separated table containing the concatenated results from the summary tables along with the taxonomic classification if the parameter `--run_taxa_classification` is called. [MMseqs2](https://github.com/soedinglab/MMseqs2) classifies the taxonomic lineage of contigs based on the last common ancestor. The inferred taxonomic lineages are included in the final workflow summaries to annotate the potential source bacteria of the identified AMPs, ARGs, and/or BGCs. @@ -424,7 +424,7 @@ Output Summaries: Output files - `ampcombi/` - - `ampcombi_complete_summary.tsv`: tab separated table containing the concatenated results from the ampcombi summary tables. This is the output given when the taxonomic classification is not activated, i.e., pipeline default. + - `ampcombi_complete_summary.tsv`: tab-separated table containing the concatenated results from the AMPcombi summary tables. This is the output given when the taxonomic classification is not activated (pipeline default). - `ampcombi_complete_summary_taxonomy.tsv.gz`: summarised output from all AMP workflow tools with taxonomic assignment in compressed tsv format. 
- `ampcombi.log`: a log file generated by ampcombi - `/*_ampcombi.csv`: summarised output in csv for each sample @@ -484,7 +484,7 @@ Output Summaries: - `hamronization_summarize/` one of the following: - `hamronization_combined_report.json`: summarised output in .json format - `hamronization_combined_report.tsv`: summarised output in .tsv format when the taxonomic classification is turned off (pipeline default). - - `hamronization_combined_report.tsv.gz`: summarised output in zipped format when the taxonomic classification is turned on by `run_taxa_classification`. + - `hamronization_combined_report.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`. - `hamronization_combined_report.html`: interactive output in .html format @@ -540,8 +540,8 @@ Output Summaries: Output files - `comBGC/` - - `combgc_complete_summary.tsv`: summarised output from all BGC detection tools used in tsv format (all samples concatenated). This is the output given when the taxonomic classification is not activated, i.e., pipeline default. - - `combgc_complete_summary.tsv.gz`: summarised output in zipped format from all BGC detection tools used in tsv format (all samples concatenated) along with the taxonomic classification obtained when `run_taxa_classification` is activated. + - `combgc_complete_summary.tsv`: summarised output from all BGC detection tools used in tsv format (all samples concatenated). This is the output given when the taxonomic classification is not activated (pipeline default). + - `combgc_complete_summary.tsv.gz`: summarised output in gzipped format from all BGC detection tools used in tsv format (all samples concatenated) along with the taxonomic classification obtained when `--run_taxa_classification` is activated. - `*/combgc_summary.tsv`: summarised output from all applied BGC detection tools in tsv format for each sample. 
diff --git a/nextflow_schema.json b/nextflow_schema.json index a9db953d..31678cd0 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -76,8 +76,8 @@ "properties": { "run_taxa_classification": { "type": "boolean", - "description": "Activates the taxonomic classification of input mucleotide sequences.", - "help_text": "This flag turns on the taxonomic classification of input nucleotide sequences. The taxonomic annotations should be turned on if the input metagenomes' bacterial sources are unknown, which can help identify the source of the AMP, BGC or ARG hit obtained for laboratory experiments. This flag should be turned off (which is by deafult) if the input nucleotide sequences represnet a single known genome or *nf-core/mag* was run beforhand. Turning on this flag relatively decreases the pipleine speed and requires >8GB RAM. Due to the size of the resulting table, the final complete summary is in a zipped format.", + "description": "Activates the taxonomic classification of input nucleotide sequences.", + "help_text": "This flag turns on the taxonomic classification of input nucleotide sequences. The taxonomic annotations should be turned on if the input metagenomes' bacterial sources are unknown, which can help identify the source of the AMP, BGC or ARG hit obtained for laboratory experiments. This flag should be turned off (which is by default) if the input nucleotide sequences represent a single known genome or *nf-core/mag* was run beforehand. Turning on this flag relatively decreases the pipeline speed and requires >8GB RAM. Due to the size of the resulting table, the final complete summary is in a zipped format.", "fa_icon": "fas fa-ad" }, "taxa_classification_tool": { @@ -104,13 +104,13 @@ "taxa_classification_mmseqs_databases_id": { "type": "string", "default": "Kalamari", - "help_text": "Specify which MMseqs2-formatted database to use to classify the input contigs. 
This can be a nucleotide or amino acid database that includes taxonomic classifications. For example, both GTDB (an amico acid database) and SILVA (a nucleotide database) are supported by MMseqs2. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases \".", + "help_text": "Specify which MMseqs2-formatted database to use to classify the input contigs. This can be a nucleotide or amino acid database that includes taxonomic classifications. For example, both GTDB (an amino acid database) and SILVA (a nucleotide database) are supported by MMseqs2. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases \".", "description": "Specify the label of the database to be used.", "fa_icon": "fas fa-address-card" }, "taxa_classification_mmseqs_databases_savetmp": { "type": "boolean", - "help_text": "This flag saves the temporary files from downloading the database and formatting them in the MMseqs2 format into the output folder. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases: `--remove-tmp-files`\".", + "help_text": "This flag saves the temporary files from downloading the database and formatting it in the MMseqs2 format into the output folder. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf).\\n\\n> Modifies tool parameter(s):\\n> - mmseqs databases: `--remove-tmp-files`\".", "description": "Specify whether the temporary files should be saved.", "fa_icon": "fas fa-file-download" } @@ -146,14 +146,14 @@ "taxa_classification_mmseqs_taxonomy_taxlineage": { "type": "integer", "default": 1, - "help_text": "This flag specifies whether the taxonomic lineage should be included in the output `.tsv` file. 
The taxonomic lineage is obtained from the internal module of `mmseqs/taxonomy` that infers the last common ancestor to classify the taxonomy. A value of '0' writes no taxonomic lineage, a value of '1' adds a column with the full lineage names prefixed with abbreviation of the lineage level, e.g., k_Prokaryotes;p_Bacteroidetes;c_....;o_....;f_....;g_....;s_...., while a value of '2' adds a column with the full NCBI taxids lineage,e.g., 1324;2345;4546;5345. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--tax-lineage`\".", + "help_text": "This flag specifies whether the taxonomic lineage should be included in the output `.tsv` file. The taxonomic lineage is obtained from the internal module of `mmseqs/taxonomy` that infers the last common ancestor to classify the taxonomy. A value of '0' writes no taxonomic lineage, a value of '1' adds a column with the full lineage names prefixed with abbreviation of the lineage level, e.g. k_Prokaryotes;p_Bacteroidetes;c_....;o_....;f_....;g_....;s_...., while a value of '2' adds a column with the full NCBI taxids lineage, e.g. 1324;2345;4546;5345. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--tax-lineage`\".", "description": "Specify whether to include or remove the taxonomic lineage.", "fa_icon": "fab fa-audible" }, "taxa_classification_mmseqs_taxonomy_sensitivity": { "type": "string", "default": "5.0", - "help_text": "This flag specifies the speed and sensitivity of the taxonomic search. It stands for how many kmers should be produced during the preliminary seeding stage. A very fast search requires a low value e.g., '1.0' and a a very sensitive search requires e.g., '7.0'. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). 
\\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--s`\".", + "help_text": "This flag specifies the speed and sensitivity of the taxonomic search. It stands for how many kmers should be produced during the preliminary seeding stage. A very fast search requires a low value e.g. '1.0' and a very sensitive search requires e.g. '7.0'. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--s`\".", "description": "Specify the speed and sensitivity for taxonomy assignment.", "fa_icon": "fas fa-history" }, @@ -167,7 +167,7 @@ "taxa_classification_mmseqs_taxonomy_lcamode": { "type": "integer", "default": 3, - "help_text": "This flag specifies the strategy used for assigning the last common ancestor (LCA). MMseqs2 assigns taxonomy based on an accelerated approximation of the 2bLCA protocol and uses the value of '3'. In this mode, the taxonomic assignment is based not only on usual alignment parameters but also considers the taxonomic classification of the LCA. When the value '4' is used the LCA is assigned based on all the equal scoring top hits. If the value '1' is used the LCA assignment is disregarded and the taxonomic assignment is based on usual alignment parameters like evalue and coverage. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-mode`\".", + "help_text": "This flag specifies the strategy used for assigning the last common ancestor (LCA). MMseqs2 assigns taxonomy based on an accelerated approximation of the 2bLCA protocol and uses the value of '3'. In this mode, the taxonomic assignment is based not only on usual alignment parameters but also considers the taxonomic classification of the LCA. When the value '4' is used the LCA is assigned based on all the equal scoring top hits. 
If the value '1' is used the LCA assignment is disregarded and the taxonomic assignment is based on usual alignment parameters like E-value and coverage. More details can be found in the [documentation](https://mmseqs.com/latest/userguide.pdf). \\n\\n> Modifies tool parameter(s):\\n> - mmseqs taxonomy: `--lca-mode`\".", "description": "Specify the mode to assign the taxonomy.", "fa_icon": "fas fa-broom" }, @@ -373,7 +373,7 @@ "default": "Bacteria", "fa_icon": "fab fa-accusoft", "description": "Specify the kingdom that the input represents.", - "help_text": "Specifies the kingdom that the input sample is derived from and/or you wish to screen for\n\n> \u26a0\ufe0f Prokka cannot annotate Eukaryotes.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--kingdom`", + "help_text": "Specifies the kingdom that the input sample is derived from and/or you wish to screen for\n\n> ⚠️ Prokka cannot annotate Eukaryotes.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--kingdom`", "enum": ["Archaea", "Bacteria", "Mitochondria", "Viruses"] }, "annotation_prokka_gcode": { @@ -394,7 +394,7 @@ }, "annotation_prokka_evalue": { "type": "number", - "default": 1e-6, + "default": 0.000001, "description": "Minimum e-value cut-off.", "help_text": "Specifiy the minimum e-value used for filtering the alignment hits.\n\nFor more information please check Prokka [documentation](https://github.com/tseemann/prokka).\n\n> Modifies tool parameter(s):\n> - Prokka: `--evalue`", "fa_icon": "fas fa-sort-amount-down" @@ -642,7 +642,7 @@ }, "amp_ampcombi_cutoff": { "type": "number", - "default": 0, + "default": 0.0, "description": "Specify probability cutoff to filter AMPs", "help_text": "Specify the minimum probability an AMP hit must have to be retained in the final output file. 
Anything below this threshold will be removed.\n\nFor more information check AMPcombi [documentation](https://github.com/Darcy220606/AMPcombi).\n\n> Modifies tool parameter(s):\n> - AMPCOMBI: `--cutoff`", "fa_icon": "fas fa-sort-amount-up" @@ -671,7 +671,7 @@ }, "arg_amrfinderplus_identmin": { "type": "number", - "default": -1, + "default": -1.0, "help_text": "Specify the minimum percentage amino-acid identity to reference protein or nucleotide identity for nucleotide reference must have if a BLAST alignment (based on methods: BLAST or PARTIAL) was detected, otherwise NA.\n\n If you specify `-1`, this means use a curated threshold if it exists and `0.9` otherwise.\n\nSetting this value to something other than `-1` will override any curated similarity cutoffs. For BLAST: alignment is > 90% of length and > 90% identity to a protein in the AMRFinderPlus database. For PARTIAL: alignment is > 50% of length, but < 90% of length and > 90% identity to the reference, and does not end at a contig boundary.\n\nFor more information check AMRFinderPlus [documentation](https://github.com/ncbi/amr/wiki/Running-AMRFinderPlus#--organism-option).\n\n> Modifies tool parameter(s):\n> - AMRFinderPlus: `--ident_min`", "description": "Minimum percent identity to reference sequence.", "fa_icon": "fas fa-angle-left" @@ -989,7 +989,7 @@ "default": 1000, "description": "Minimum longest-contig length a sample must have to be screened with antiSMASH.", "fa_icon": "fas fa-ruler-horizontal", - "help_text": "This specifies the minimum length that the longest contig must have for the entire sample to be screened by antiSMASH.\n\nAny samples that do not reach this length will be not be sent to antiSMASH, therefore you will not receive output for these samples in your `--outdir`.\n\n> \u26a0\ufe0f This is not the same as `--bgc_antismash_contigminlength`, which specifies to only analyse contigs above that threshold but _within_ a sample that has already passed `--bgc_antismash_sampleminlength` 
sample filter!" + "help_text": "This specifies the minimum length that the longest contig must have for the entire sample to be screened by antiSMASH.\n\nAny samples that do not reach this length will be not be sent to antiSMASH, therefore you will not receive output for these samples in your `--outdir`.\n\n> ⚠️ This is not the same as `--bgc_antismash_contigminlength`, which specifies to only analyse contigs above that threshold but _within_ a sample that has already passed `--bgc_antismash_sampleminlength` sample filter!" }, "bgc_antismash_contigminlength": { "type": "integer", @@ -1157,7 +1157,7 @@ "type": "number", "description": "The p-value cutoff for protein domains to be included.", "fa_icon": "fas fa-filter", - "default": 1e-9, + "default": 0.000000001, "help_text": "The p-value cutoff for protein domains to be included.\n\nFor more information see the GECCO [documentation](https://github.com/zellerlab/GECCO).\n\n> Modifies tool parameter(s):\n> - GECCO: `--pfilter`" }, "bgc_gecco_threshold": { diff --git a/subworkflows/local/taxa_class.nf b/subworkflows/local/taxa_class.nf index eb03836d..ec9f273a 100644 --- a/subworkflows/local/taxa_class.nf +++ b/subworkflows/local/taxa_class.nf @@ -2,10 +2,10 @@ TAXONOMIC CLASSIFICATION */ -include { MMSEQS_CREATEDB } from '../../modules/nf-core/mmseqs/createdb/main' -include { MMSEQS_DATABASES } from '../../modules/nf-core/mmseqs/databases/main' -include { MMSEQS_TAXONOMY } from '../../modules/nf-core/mmseqs/taxonomy/main' -include { MMSEQS_CREATETSV } from '../../modules/nf-core/mmseqs/createtsv/main' +include { MMSEQS_CREATEDB } from '../../modules/nf-core/mmseqs/createdb/main' +include { MMSEQS_DATABASES } from '../../modules/nf-core/mmseqs/databases/main' +include { MMSEQS_TAXONOMY } from '../../modules/nf-core/mmseqs/taxonomy/main' +include { MMSEQS_CREATETSV } from '../../modules/nf-core/mmseqs/createtsv/main' workflow TAXA_CLASS { take: @@ -35,21 +35,21 @@ workflow TAXA_CLASS { // Create db for query contigs, 
assign taxonomy and convert to table format // MMSEQS_CREATEDB MMSEQS_CREATEDB ( contigs ) - ch_versions = ch_versions.mix(MMSEQS_CREATEDB.out.versions) - ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db + ch_versions = ch_versions.mix( MMSEQS_CREATEDB.out.versions ) + ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db // MMSEQS_TAXONOMY MMSEQS_TAXONOMY ( ch_taxonomy_querydb, ch_mmseqs_db ) - ch_versions = ch_versions.mix(MMSEQS_TAXONOMY.out.versions) + ch_versions = ch_versions.mix( MMSEQS_TAXONOMY.out.versions ) ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db_taxonomy // MMSEQS_CREATETSV MMSEQS_CREATETSV ( ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb ) - ch_versions = ch_versions.mix(MMSEQS_CREATETSV.out.versions) + ch_versions = ch_versions.mix( MMSEQS_CREATETSV.out.versions ) ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv } emit: versions = ch_versions - sample_taxonomy = ch_taxonomy_tsv //channel: [ val(meta), tsv ] + sample_taxonomy = ch_taxonomy_tsv // channel: [ val(meta), tsv ] }