update to reviewers' comments
Darcy220606 committed Mar 23, 2024
1 parent 44c20f1 commit 0bc085c
Showing 21 changed files with 412 additions and 217 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/ci.yml
@@ -77,3 +77,34 @@ jobs:
- name: Run pipeline with test data (BGC workflow)
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_bgc,docker --outdir ./results ${{ matrix.parameters }} --bgc_skip_deepbgc
test_taxonomy:
name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows)
# Only run on push if this is the nf-core dev branch (merged PRs)
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}"
runs-on: ubuntu-latest
strategy:
matrix:
NXF_VER:
- "23.04.0"
- "latest-everything"
parameters:
- "--annotation_tool prodigal"
- "--annotation_tool prokka"
- "--annotation_tool bakta --annotation_bakta_db_downloadtype light"

steps:
- name: Check out pipeline code
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4

- name: Install Nextflow
uses: nf-core/setup-nextflow@v1
with:
version: "${{ matrix.NXF_VER }}"

- name: Disk space cleanup
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1

- name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows)
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test_taxonomy,docker --outdir ./results ${{ matrix.parameters }}
7 changes: 2 additions & 5 deletions CHANGELOG.md
@@ -15,11 +15,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Fixed`

- [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with 'sample_id\tcontig_id\t..'. Reformated the output of 'hamronization summarize' module. (by @darcy220606)
- [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. Removed `tests/` from `.gitignore` (by
@darcy220606)

### `Dependencies`
- [#343](https://github.com/nf-core/funcscan/pull/343) Standardized the resulting workflow summary tables to always start with 'sample_id\tcontig_id\t..'. Reformatted the output of `hamronization/summarize` module. (by @darcy220606)
- [#348](https://github.com/nf-core/funcscan/pull/348) Updated samplesheet for pipeline tests to 'samplesheet_reduced.csv' with smaller datasets to reduce resource consumption. Updated prodigal module to fix pigz issue. Removed `tests/` from `.gitignore`. (by @darcy220606)

| Tool | Previous version | New version |
| ------- | ---------------- | ----------- |
2 changes: 1 addition & 1 deletion CITATIONS.md
@@ -92,7 +92,7 @@
- [MMseqs2](https://doi.org/10.1093/bioinformatics/btab184)

> Mirdita M., Steinegger M., Breitwieser F., Söding J., Levy Karin E. (2021). Fastand sensitive taxonomic assignment to metagenomic contigs, Bioinformatics, 37(18),3029–3031. [DOI: 10.1093/bioinformatics/btab184](https://doi.org/10.1093bioinformatics/btab184)
> Mirdita, M., Steinegger, M., Breitwieser, F., Söding, J., Levy Karin, E. (2021). Fast and sensitive taxonomic assignment to metagenomic contigs. Bioinformatics, 37(18), 3029–3031. [DOI: 10.1093/bioinformatics/btab184](https://doi.org/10.1093/bioinformatics/btab184)
## Software packaging/containerisation tools

2 changes: 1 addition & 1 deletion README.md
@@ -30,7 +30,7 @@ The nf-core/funcscan AWS full test dataset are contigs generated by the MGnify s

## Pipeline summary

1. Taxonomic classification of contigs from **of prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2).
1. Taxonomic classification of contigs of **prokaryotic origin** with [`MMseqs2`](https://github.com/soedinglab/MMseqs2)
2. Annotation of assembled prokaryotic contigs with [`Prodigal`](https://github.com/hyattpd/Prodigal), [`Pyrodigal`](https://github.com/althonos/pyrodigal), [`Prokka`](https://github.com/tseemann/prokka), or [`Bakta`](https://github.com/oschwengers/bakta)
3. Screening contigs for antimicrobial peptide-like sequences with [`ampir`](https://cran.r-project.org/web/packages/ampir/index.html), [`Macrel`](https://github.com/BigDataBiology/macrel), [`HMMER`](http://hmmer.org/), [`AMPlify`](https://github.com/bcgsc/AMPlify)
4. Screening contigs for antibiotic resistant gene-like sequences with [`ABRicate`](https://github.com/tseemann/abricate), [`AMRFinderPlus`](https://github.com/ncbi/amr), [`fARGene`](https://github.com/fannyhb/fargene), [`RGI`](https://card.mcmaster.ca/analyze/rgi), [`DeepARG`](https://bench.cs.vt.edu/deeparg)
15 changes: 7 additions & 8 deletions bin/merge_taxonomy.py
@@ -35,7 +35,7 @@
subparsers = parser.add_subparsers(required=True)

#########################################
# SUBPARSERS : AMPCOMBI
# SUBPARSER: AMPCOMBI
#########################################
ampcombi_parser = subparsers.add_parser('ampcombi_taxa')

@@ -44,7 +44,7 @@
ampcombi_parser.add_argument("--taxonomy", dest="taxa1", nargs='+', help="Enter the list of taxonomy files for all samples. ")

#########################################
# SUBPARSERS : COMBGC
# SUBPARSER: COMBGC
#########################################
combgc_parser = subparsers.add_parser('combgc_taxa')

@@ -53,7 +53,7 @@
combgc_parser.add_argument("--taxonomy", dest="taxa2", nargs='+', help="Enter the list of taxonomy files for all samples. ")

#########################################
# SUBPARSERS : HAMRONIZATION
# SUBPARSER: HAMRONIZATION
#########################################
hamronization_parser = subparsers.add_parser('hamronization_taxa')

@@ -74,15 +74,14 @@ def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
lineage = str(row['mmseqs_lineage_contig'])
if 'Eukaryota' in lineage or 'root' in lineage:
mmseqs2_df.at[i, 'mmseqs_lineage_contig'] = np.nan
#mmseqs2_df['mmseqs_lineage_contig'].unique()
# insert the sample name in the first column according to the file basename
file_basename = os.path.basename(mmseqs_taxonomy)
filename = os.path.splitext(file_basename)[0]
mmseqs2_df.insert(0, 'sample_id', filename)
return mmseqs2_df
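The hunk above shows only a fragment of `reformat_mmseqs_taxonomy`; the full script operates on a pandas DataFrame. A minimal, pandas-free sketch of the same idea (hypothetical, stdlib-only for illustration) is:

```python
import os

def reformat_mmseqs_taxonomy(mmseqs_taxonomy, rows):
    """Blank out non-prokaryotic lineages and prepend a sample_id column.

    `rows` stands in for the parsed MMseqs2 taxonomy TSV (a list of dicts);
    the real script reads the file into a pandas DataFrame instead.
    """
    for row in rows:
        lineage = str(row.get('mmseqs_lineage_contig', ''))
        # Eukaryotic or unresolved ('root') assignments are treated as missing
        if 'Eukaryota' in lineage or 'root' in lineage:
            row['mmseqs_lineage_contig'] = None
    # sample_id is derived from the taxonomy file's basename, minus extension
    sample_id = os.path.splitext(os.path.basename(mmseqs_taxonomy))[0]
    return [{'sample_id': sample_id, **row} for row in rows]
```

This mirrors the two steps visible in the diff: lineages containing `Eukaryota` or `root` are set to missing, and the sample identifier is taken from the taxonomy file's basename and inserted as the first column.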

#########################################
# FUNCTION : AMPCOMBI
# FUNCTION: AMPCOMBI
#########################################
def ampcombi_taxa(args):
merged_df = pd.DataFrame()
@@ -128,7 +127,7 @@ def ampcombi_taxa(args):
merged_df.to_csv('ampcombi_complete_summary_taxonomy.tsv', sep='\t', index=False)

#########################################
# FUNCTION : COMBGC
# FUNCTION: COMBGC
#########################################
def combgc_taxa(args):
merged_df = pd.DataFrame()
@@ -172,7 +171,7 @@ def combgc_taxa(args):
merged_df.to_csv('combgc_complete_summary_taxonomy.tsv', sep='\t', index=False)

#########################################
# FUNCTION : HAMRONIZATION
# FUNCTION: HAMRONIZATION
#########################################
def hamronization_taxa(args):
merged_df = pd.DataFrame()
@@ -220,7 +219,7 @@ def hamronization_taxa(args):
merged_df.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False)

#########################################
# SUBPARSERS : DEFAULT
# SUBPARSERS: DEFAULT
#########################################
ampcombi_parser.set_defaults(func=ampcombi_taxa)
combgc_parser.set_defaults(func=combgc_taxa)
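The script wires each subcommand to its handler via `set_defaults(func=...)`. A self-contained sketch of that argparse dispatch pattern (with a simplified, hypothetical `ampcombi_taxa` handler and argument names) looks like:

```python
import argparse

def ampcombi_taxa(args):
    # In the real script this merges AMPcombi summaries with taxonomy tables;
    # here it just reports which files it would process.
    return f"merging {len(args.taxonomy)} taxonomy file(s)"

parser = argparse.ArgumentParser(prog='merge_taxonomy')
subparsers = parser.add_subparsers(required=True)

ampcombi_parser = subparsers.add_parser('ampcombi_taxa')
ampcombi_parser.add_argument('--taxonomy', nargs='+', required=True)
# set_defaults stores the handler on the parsed namespace as `func`
ampcombi_parser.set_defaults(func=ampcombi_taxa)

def main(argv):
    args = parser.parse_args(argv)
    return args.func(args)  # dispatch to the chosen subcommand's handler
```

Each `add_parser` call defines one subcommand, and `args.func(args)` dispatches to whichever handler was registered for the subcommand the user typed.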
57 changes: 37 additions & 20 deletions conf/modules.config
@@ -46,49 +46,50 @@ process {

withName: MMSEQS_DATABASES {
publishDir = [
path: { "${params.outdir}/databases/" },
path: { "${params.outdir}/databases/mmseqs/" }, // dir==mmseqs_database/
mode: params.publish_dir_mode,
enabled: params.save_databases,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
ext.args = [
params.taxonomy_mmseqs_databases_savetmp ? "" : "--remove-tmp-files" ,
params.mmseqs_databases_savetmp ? "" : "--remove-tmp-files" ,
].join(' ').trim()
}

withName: MMSEQS_CREATEDB {
publishDir = [
path: { "${params.outdir}/taxonomy/mmseqs_createdb/" },
path: { "${params.outdir}/databases/mmseqs/mmseqs_createdb/" },
mode: params.publish_dir_mode,
enabled: params.taxonomy_mmseqs_save_intermedfiles,
enabled: params.save_databases,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}

withName: MMSEQS_TAXONOMY {
publishDir = [
path: { "${params.outdir}/taxonomy/mmseqs_taxonomy/" },
path: { "${params.outdir}/databases/mmseqs/mmseqs_taxonomy/" },
mode: params.publish_dir_mode,
enabled: params.taxonomy_mmseqs_save_intermedfiles,
enabled: params.save_databases,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
ext.args = [
params.taxonomy_mmseqs_taxonomy_savetmp ? "" : "--remove-tmp-files",
"--search-type ${params.taxonomy_mmseqs_taxonomy_searchtype}",
"--lca-ranks ${params.taxonomy_mmseqs_taxonomy_lcaranks}",
"--tax-lineage ${params.taxonomy_mmseqs_taxonomy_taxlineage}",
"-s ${params.taxonomy_mmseqs_taxonomy_sensitivity}",
"--orf-filter-s ${params.taxonomy_mmseqs_taxonomy_orffilters}",
"--lca-mode ${params.taxonomy_mmseqs_taxonomy_lcamode}",
"--vote-mode ${params.taxonomy_mmseqs_taxonomy_votemode}",
params.mmseqs_taxonomy_savetmp ? "" : "--remove-tmp-files",
"--search-type ${params.mmseqs_taxonomy_searchtype}",
"--lca-ranks ${params.mmseqs_taxonomy_lcaranks}",
"--tax-lineage ${params.mmseqs_taxonomy_taxlineage}",
"-s ${params.mmseqs_taxonomy_sensitivity}",
"--orf-filter-s ${params.mmseqs_taxonomy_orffilters}",
"--lca-mode ${params.mmseqs_taxonomy_lcamode}",
"--vote-mode ${params.mmseqs_taxonomy_votemode}",
].join(' ').trim()
}

withName: MMSEQS_CREATETSV {
publishDir = [
path: { "${params.outdir}/taxonomy/mmseqs_createtsv/${meta.id}" },
path: { "${params.outdir}/taxonomic_classification/mmseqs_createtsv/${meta.id}/" },
mode: params.publish_dir_mode,
enabled: params.taxonomy_mmseqs_save_intermedfiles,
enabled: params.run_taxonomic_classification,
pattern: "*.tsv",
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
]
}
@@ -499,11 +500,19 @@ process {
publishDir = [
path: { "${params.outdir}/reports/hamronization_summarize" },
mode: params.publish_dir_mode,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
saveAs: { (params.run_taxonomic_classification == false) ? it : null }
]
}

withName: MERGE_TAXONOMY_HAMRONIZATION {
publishDir = [
path: { "${params.outdir}/reports/hamronization_summarize" },
mode: params.publish_dir_mode,
saveAs: { _ -> null } //do not save the file
]
}

withName: ARG_TABIX_BGZIP {
publishDir = [
path: { "${params.outdir}/reports/hamronization_summarize" },
mode: params.publish_dir_mode,
@@ -525,12 +534,12 @@
withName: MERGE_TAXONOMY_AMPCOMBI {
publishDir = [
path: { "${params.outdir}/reports/ampcombi" },
mode: params.taxonomy_mmseqs_save_intermedfiles,
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
mode: params.publish_dir_mode,
saveAs: { _ -> null } //do not save the file
]
}

withName: TABIX_BGZIP {
withName: AMP_TABIX_BGZIP {
publishDir = [
path: { "${params.outdir}/reports/ampcombi" },
mode: params.publish_dir_mode,
@@ -547,6 +556,14 @@
}

withName: MERGE_TAXONOMY_COMBGC {
publishDir = [
path: { "${params.outdir}/reports/combgc" },
mode: params.publish_dir_mode,
saveAs: { _ -> null } //do not save the file
]
}

withName: BGC_TABIX_BGZIP {
publishDir = [
path: { "${params.outdir}/reports/combgc" },
mode: params.publish_dir_mode,
10 changes: 5 additions & 5 deletions conf/test.config
@@ -16,17 +16,17 @@ params {

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '9.GB'
max_memory = '8.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv'
amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm'

annotation_tool = 'prodigal'
annotation_tool = 'prodigal'

run_arg_screening = true
arg_fargene_hmmmodel = 'class_a,class_b_1_2'
run_arg_screening = true
arg_fargene_hmmmodel = 'class_a,class_b_1_2'

run_amp_screening = true
run_amp_screening = true
}
2 changes: 1 addition & 1 deletion conf/test_bgc.config
@@ -16,7 +16,7 @@ params {

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '9.GB'
max_memory = '8.GB'
max_time = '6.h'

// Input data
38 changes: 38 additions & 0 deletions conf/test_taxonomy.config
@@ -0,0 +1,38 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/funcscan -profile test_taxonomy,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Taxonomic classification test profile'
config_profile_description = 'Minimal test dataset to check taxonomic classification workflow function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '8.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv'
bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm'
amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm'

run_taxonomic_classification = true
annotation_tool = 'prodigal'

run_arg_screening = true
arg_skip_deeparg = true
arg_skip_amrfinderplus = true

run_amp_screening = true

run_bgc_screening = true
bgc_skip_deepbgc = true
}