nf-core · jasmezz · Jun 4, 2024 · Jun 3, 2024 · Jun 3, 2024 · Jun 3, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,8 +13,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#338](https://github.com/nf-core/funcscan/pull/338) Set `--meta` parameter to default for Bakta, with singlemode optional. (by @jasmezz)
 - [#343](https://github.com/nf-core/funcscan/pull/343) Added contig taxonomic classification using [MMseqs2](https://github.com/soedinglab/MMseqs2/). (by @darcy220606)
 - [#358](https://github.com/nf-core/funcscan/pull/358) Improved RGI databases handling, users can supply their own CARD now. (by @jasmezz)
-- [#375](https://github.com/nf-core/funcscan/pull/375) Merged pipeline template of nf-core/tools version 2.14.1 (by @jfy133)
+- [#375](https://github.com/nf-core/funcscan/pull/375) Merged pipeline template of nf-core/tools version 2.14.1. (by @jfy133)
 - [#381](https://github.com/nf-core/funcscan/pull/381) Added support for supplying pre-annotated sequences to the pipeline. (by @jfy133, @jasmezz)
+- [#382](https://github.com/nf-core/funcscan/pull/382) Optimised BGC screening run time and prevented crashes due to too-short contigs by adding contig length filtering for the BGC workflow only. (by @jfy133, @darcy220606)
 
 ### `Fixed`
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -83,10 +83,23 @@ process {
         ]
     }
 
+    withName: SEQKIT_SEQ {
+        ext.prefix = { "${meta.id}_long" } // "_long" suffix marks the length-filtered contig set
+        publishDir = [
+            path: { "${params.outdir}/bgc/seqkit/" },
+            mode: params.publish_dir_mode,
+            enabled: params.bgc_savefilteredcontigs, // filtered contigs are only published when this parameter is set
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename } // never publish versions.yml
+        ]
+        ext.args = [
+            "--min-len ${params.bgc_mincontiglength}" // length threshold comes from the bgc_mincontiglength parameter
+        ].join(' ').trim()
+    }
+
     withName: PROKKA {
         ext.prefix =  { "${meta.id}_prokka" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping
         publishDir = [
-            path: { "${params.outdir}/annotation/prokka/" },
+            path: { "${params.outdir}/annotation/prokka/${meta.category}/" },
             mode: params.publish_dir_mode,
             enabled: params.save_annotations,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
@@ -122,7 +135,7 @@ process {
     withName: BAKTA_BAKTA {
         ext.prefix =  { "${meta.id}_bakta" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping
         publishDir = [
-            path: { "${params.outdir}/annotation/bakta/${meta.id}" },
+            path: { "${params.outdir}/annotation/bakta/${meta.category}/" },
             mode: params.publish_dir_mode,
             enabled: params.save_annotations,
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
@@ -152,7 +165,7 @@ process {
 
     withName: PRODIGAL {
         publishDir = [
-            path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
+            path: { "${params.outdir}/annotation/prodigal/${meta.category}/" },
             mode: params.publish_dir_mode,
             enabled: params.save_annotations,
             pattern: "*.{faa,fna,gbk,faa.gz,faa.gz,fna.gz,gbk.gz}",
@@ -169,7 +182,7 @@ process {
     withName: PYRODIGAL {
         ext.prefix =  { "${meta.id}_pyrodigal" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping
         publishDir = [
-            path: { "${params.outdir}/annotation/pyrodigal/${meta.id}" },
+            path: { "${params.outdir}/annotation/pyrodigal/${meta.category}/" },
             mode: params.publish_dir_mode,
             enabled: params.save_annotations,
             pattern: "*.{faa,fna,gbk,score}.gz",
@@ -281,7 +294,7 @@ process {
         ext.args = params.arg_fargene_orffinder ? '--orf-finder' : ''
     }
 
-    withName:UNTAR_CARD {
+    withName: UNTAR_CARD {
 
         ext.prefix = "card_database"
         publishDir = [
@@ -295,7 +308,7 @@ process {
 
     }
 
-    withName:RGI_CARDANNOTATION {
+    withName: RGI_CARDANNOTATION {
         publishDir = [
             [
                 path: { "${params.outdir}/databases/rgi" },

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -11,33 +11,36 @@
 */
 
 params {
-    config_profile_name        = 'Full test profile'
-    config_profile_description = 'Full test dataset to check pipeline function'
+    config_profile_name           = 'Full test profile'
+    config_profile_description    = 'Full test dataset to check pipeline function'
 
     // Input data for full size test
-    input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_full.csv'
+    input                         = params.pipelines_testdata_base_path + 'funcscan/samplesheet_full.csv'
 
     // Database and annotation options
-    save_annotations                = true
+    save_annotations              = true
 
     // AMP params
-    run_amp_screening               = true
-    amp_hmmsearch_models            = params.pipelines_testdata_base_path + 'funcscan/hmms/mybacteriocin.hmm'
-    amp_hmmsearch_savealignments    = true
-    amp_hmmsearch_savedomains       = true
-    amp_hmmsearch_savetargets       = true
-    amp_skip_amplify                = true
+    run_amp_screening             = true
+    amp_hmmsearch_models          = params.pipelines_testdata_base_path + 'funcscan/hmms/mybacteriocin.hmm'
+    amp_hmmsearch_savealignments  = true
+    amp_hmmsearch_savedomains     = true
+    amp_hmmsearch_savetargets     = true
+    amp_skip_amplify              = true
 
     // ARG params
-    run_arg_screening               = true
-    arg_skip_deeparg                = false
+    run_arg_screening             = true
+    arg_skip_deeparg              = false
 
     // BGC params
-    run_bgc_screening               = true
-    bgc_hmmsearch_models            = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm'
-    bgc_hmmsearch_savealignments    = true
-    bgc_hmmsearch_savetargets       = true
-    bgc_hmmsearch_savedomains       = true
-    bgc_skip_deepbgc                = true // takes too long
+    run_bgc_screening             = true
+    bgc_hmmsearch_models          = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm'
+    bgc_hmmsearch_savealignments  = true
+    bgc_hmmsearch_savetargets     = true
+    bgc_hmmsearch_savedomains     = true
+    bgc_skip_deepbgc              = true // takes too long
+    bgc_mincontiglength           = 1000 // minimum contig length handed to SEQKIT_SEQ (--min-len)
+    bgc_savefilteredcontigs       = true // publish the length-filtered contigs under bgc/seqkit/
+    bgc_antismash_contigminlength = 1000
 
 }
diff --git a/conf/test_taxonomy.config b/conf/test_taxonomy.config
@@ -20,21 +20,25 @@ params {
     max_time   = '6.h'
 
     // Input data
-    input                   = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv'
-    bgc_hmmsearch_models    = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm'
-    amp_hmmsearch_models    = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm'
+    input                         = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv'
+    bgc_hmmsearch_models          = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm'
+    amp_hmmsearch_models          = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm'
 
-    run_taxa_classification = true
-    annotation_tool         = 'pyrodigal'
+    run_taxa_classification       = true
+    annotation_tool               = 'pyrodigal'
+    save_annotations              = true
 
-    run_arg_screening       = true
-    arg_skip_deeparg        = true
-    arg_skip_amrfinderplus  = true
+    run_arg_screening             = true
+    arg_skip_deeparg              = true
+    arg_skip_amrfinderplus        = true
 
-    run_amp_screening       = true
+    run_amp_screening             = true
 
-    run_bgc_screening       = true
-    bgc_skip_deepbgc        = true
+    run_bgc_screening             = true
+    bgc_mincontiglength           = 1000
+    bgc_savefilteredcontigs       = true
+    bgc_skip_deepbgc              = true
+    bgc_antismash_contigminlength = 1000
 }
 
 process {

diff --git a/docs/output.md b/docs/output.md
@@ -130,10 +130,11 @@ Output Summaries:
 <summary>Output files</summary>
 
 - `prodigal/`
-  - `<samplename>/`:
-    - `*.fna`: nucleotide FASTA file of the input contig sequences
-    - `*.faa`: protein FASTA file of the translated CDS sequences
-    - `*.gbk`: annotation in GBK format, containing both sequences and annotations
+  - `category/`: indicates whether annotation files are of all contigs or `long`-only contigs (BGC subworkflow only)
+    - `<samplename>/`:
+      - `*.fna`: nucleotide FASTA file of the input contig sequences
+      - `*.faa`: protein FASTA file of the translated CDS sequences
+      - `*.gbk`: annotation in GBK format, containing both sequences and annotations
 
 > Descriptions taken from the [Prodigal documentation](https://github.com/hyattpd/prodigal/wiki)
 
@@ -147,10 +148,11 @@ Output Summaries:
 <summary>Output files</summary>
 
 - `pyrodigal/`
-  - `<samplename>/`:
-    - `*.gbk`: annotation in GBK format, containing both sequences and annotations
-    - `*.fna`: nucleotide FASTA file of the annotated CDS sequences
-    - `*.faa`: protein FASTA file of the translated CDS sequences
+  - `category/`: indicates whether annotation files are of all contigs or `long`-only contigs (BGC subworkflow only)
+    - `<samplename>/`:
+      - `*.gbk`: annotation in GBK format, containing both sequences and annotations
+      - `*.fna`: nucleotide FASTA file of the annotated CDS sequences
+      - `*.faa`: protein FASTA file of the translated CDS sequences
 
 > Descriptions taken from the [Pyrodigal documentation](https://pyrodigal.readthedocs.io/)
 
@@ -164,19 +166,20 @@ Output Summaries:
 <summary>Output files</summary>
 
 - `prokka/`
-  - `<samplename>/`
-    - `*.gff`: annotation in GFF3 format, containing both sequences and annotations
-    - `*.gbk`: standard Genbank file derived from the master .gff
-    - `*.fna`: nucleotide FASTA file of the input contig sequences
-    - `*.faa`: protein FASTA file of the translated CDS sequences
-    - `*.ffn`: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA)
-    - `*.sqn`: an ASN1 format "Sequin" file for submission to Genbank
-    - `*.fsa`: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file
-    - `*.tbl`: feature Table file, used by "tbl2asn" to create the .sqn file
-    - `*.err`: unacceptable annotations - the NCBI discrepancy report
-    - `*.log`: logging output that Prokka produced during its run
-    - `*.txt`: statistics relating to the annotated features found
-    - `*.tsv`: tab-separated file of all features
+  - `category/`: indicates whether annotation files are of all contigs or `long`-only contigs (BGC subworkflow only)
+    - `<samplename>/`
+      - `*.gff`: annotation in GFF3 format, containing both sequences and annotations
+      - `*.gbk`: standard Genbank file derived from the master .gff
+      - `*.fna`: nucleotide FASTA file of the input contig sequences
+      - `*.faa`: protein FASTA file of the translated CDS sequences
+      - `*.ffn`: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA)
+      - `*.sqn`: an ASN1 format "Sequin" file for submission to Genbank
+      - `*.fsa`: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file
+      - `*.tbl`: feature Table file, used by "tbl2asn" to create the .sqn file
+      - `*.err`: unacceptable annotations - the NCBI discrepancy report
+      - `*.log`: logging output that Prokka produced during its run
+      - `*.txt`: statistics relating to the annotated features found
+      - `*.tsv`: tab-separated file of all features
 
 > Descriptions directly from the [Prokka documentation](https://github.com/tseemann/prokka#output-files)
 
@@ -190,17 +193,18 @@ Output Summaries:
 <summary>Output files</summary>
 
 - `bakta/`
-  - `<samplename>`
-    - `<samplename>.gff3`: annotations & sequences in GFF3 format
-    - `<samplename>.gbff`: annotations & sequences in (multi) GenBank format
-    - `<samplename>.ffn`: feature nucleotide sequences as FASTA
-    - `<samplename>.fna`: replicon/contig DNA sequences as FASTA
-    - `<samplename>.embl`: annotations & sequences in (multi) EMBL format
-    - `<samplename>.faa`: CDS/sORF amino acid sequences as FASTA
-    - `<samplename>_hypothetical.faa`: further information on hypothetical protein CDS as simple human readble tab separated values
-    - `<samplename>_hypothetical.tsv`: hypothetical protein CDS amino acid sequences as FASTA
-    - `<samplename>.tsv`: annotations as simple human readble TSV
-    - `<samplename>.txt`: summary in TXT format
+  - `category/`: indicates whether annotation files are of all contigs or `long`-only contigs (BGC subworkflow only)
+    - `<samplename>`
+      - `<samplename>.gff3`: annotations & sequences in GFF3 format
+      - `<samplename>.gbff`: annotations & sequences in (multi) GenBank format
+      - `<samplename>.ffn`: feature nucleotide sequences as FASTA
+      - `<samplename>.fna`: replicon/contig DNA sequences as FASTA
+      - `<samplename>.embl`: annotations & sequences in (multi) EMBL format
+      - `<samplename>.faa`: CDS/sORF amino acid sequences as FASTA
+      - `<samplename>_hypothetical.faa`: hypothetical protein CDS amino acid sequences as FASTA
+      - `<samplename>_hypothetical.tsv`: further information on hypothetical protein CDS as simple human readable tab separated values
+      - `<samplename>.tsv`: annotations as simple human readable TSV
+      - `<samplename>.txt`: summary in TXT format
 
 > Descriptions taken from the [Bakta documentation](https://github.com/oschwengers/bakta#output).
 
@@ -355,7 +359,22 @@ Output Summaries:
 
 ### BGC detection tools
 
-[antiSMASH](#antismash), [deepBGC](#deepbgc), [GECCO](#gecco), [hmmsearch](#hmmsearch)
+[antiSMASH](#antismash), [deepBGC](#deepbgc), [GECCO](#gecco), [hmmsearch](#hmmsearch).
+
+Note that the BGC tools are run on a set of annotations generated on only long contigs (3000 bp or longer) by default. These specific filtered FASTA files are under `bgc/seqkit/`, and the corresponding annotation files are under `annotation/<annotation_tool>/long/`, if the corresponding saving flags are specified (see [parameter docs](https://nf-co.re/funcscan/parameters)). However, the same annotations _should_ also be present in the annotation files in the sister `all/` directory.
+
+### Input contig QC
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `seqkit/`
+  - `<samplename>_long.fasta`: FASTA file containing contigs equal to or longer than the threshold set by `--bgc_mincontiglength`, used in the BGC subworkflow
+  </details>
+
+[SeqKit](https://bioinf.shenwei.me/seqkit/) is a cross-platform and ultrafast toolkit for FASTA/Q file manipulation.
+
+Note that the filtered FASTA is only used in the BGC workflow, for run-time optimisation and biological reasons. All contigs are otherwise screened in the ARG/AMP workflows.
 
 #### antiSMASH
 

diff --git a/modules.json b/modules.json
@@ -37,7 +37,7 @@
                     },
                     "antismash/antismashlite": {
                         "branch": "master",
-                        "git_sha": "1c78323903f07d62bb57686914b567fb2018b1e4",
+                        "git_sha": "39c00d0873ff072b2901d8ae1f36abe1522c90f2",
                         "installed_by": ["modules"]
                     },
                     "antismash/antismashlitedownloaddatabases": {
@@ -82,7 +82,7 @@
                     },
                     "gecco/run": {
                         "branch": "master",
-                        "git_sha": "f6867fb2512d9a6c276af0c50d59fab6df46d7dd",
+                        "git_sha": "f9707f9499a90a46208873d23440e22ac8ad5ebc",
                         "installed_by": ["modules"]
                     },
                     "gunzip": {
@@ -182,7 +182,7 @@
                     },
                     "seqkit/seq": {
                         "branch": "master",
-                        "git_sha": "687ad41c14008d3d55cf7c2ffacebe6a057211a4",
+                        "git_sha": "2be41ca2cc780eca4293d1b0dd3850b0b7ac40a3",
                         "installed_by": ["modules"]
                     },
                     "tabix/bgzip": {

diff --git a/modules/nf-core/antismash/antismashlite/main.nf b/modules/nf-core/antismash/antismashlite/main.nf
diff --git a/modules/nf-core/gecco/run/main.nf b/modules/nf-core/gecco/run/main.nf
diff --git a/modules/nf-core/gecco/run/tests/main.nf.test.snap b/modules/nf-core/gecco/run/tests/main.nf.test.snap
diff --git a/modules/nf-core/seqkit/seq/environment.yml b/modules/nf-core/seqkit/seq/environment.yml