Merge pull request #435 from jfy133/run-merging
Adds support for run merging for multi-run samples
jfy133 authored Jun 7, 2023
2 parents 3cfcc21 + 3f6d609 commit 5852d01
Showing 15 changed files with 291 additions and 51 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -61,6 +61,7 @@ jobs:
test_ancient_dna,
test_adapterremoval,
test_binrefinement,
test_binning_entry,
]
steps:
- name: Free some space
1 change: 0 additions & 1 deletion CHANGELOG.md
@@ -24,7 +24,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#406](https://github.com/nf-core/mag/pull/406) - Fix CheckM database always downloading, regardless if CheckM is selected (by @jfy133)
- [#419](https://github.com/nf-core/mag/pull/419) - Fix bug with busco_clean parameter, where it is always activated (by @prototaxites)
- [#426](https://github.com/nf-core/mag/pull/426) - Fixed typo in help text for parameters `--host_genome` and `--host_fasta` (by @tillenglert)
- [#429](https://github.com/nf-core/mag/pull/429) - Replaced hardcoded CheckM database auto-download URL to a parameter (reported by @erikrikarddaniel, fix by @jfy133)
- [#434](https://github.com/nf-core/mag/pull/434) - Fix location of samplesheet for AWS full tests (reported by @Lfulcrum, fix by @jfy133)
- [#438](https://github.com/nf-core/mag/pull/438) - Fixed version inconsistency between conda and containers for GTDBTK_CLASSIFYWF (by @jfy133)
- [#439](https://github.com/nf-core/mag/pull/445) - Fix bug in assembly input (by @prototaxites)
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@

## Pipeline summary

By default, the pipeline currently performs the following: it supports both short and long reads, quality trims the reads and adapters with [fastp](https://github.com/OpenGene/fastp) and [Porechop](https://github.com/rrwick/Porechop), and performs basic QC with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
By default, the pipeline currently performs the following: it supports both short and long reads, quality trims the reads and adapters with [fastp](https://github.com/OpenGene/fastp) and [Porechop](https://github.com/rrwick/Porechop), performs basic QC with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), and merges multiple sequencing runs.

The pipeline then:

22 changes: 17 additions & 5 deletions conf/modules.config
@@ -27,6 +27,8 @@ process {
mode: params.publish_dir_mode,
pattern: "*.html"
]
ext.prefix = { "${meta.id}_run${meta.run}_raw" }
tag = { "${meta.id}_run${meta.run}_raw" }
}

withName: FASTP {
@@ -50,6 +52,8 @@
enabled: params.save_clipped_reads
]
]
ext.prefix = { "${meta.id}_run${meta.run}_fastp" }
tag = { "${meta.id}_run${meta.run}" }
}

withName: ADAPTERREMOVAL_PE {
@@ -72,7 +76,8 @@
enabled: params.save_clipped_reads
]
]
ext.prefix = { "${meta.id}_ar2" }
ext.prefix = { "${meta.id}_run${meta.run}_ar2" }
tag = { "${meta.id}_run${meta.run}" }
}

withName: ADAPTERREMOVAL_SE {
@@ -87,11 +92,12 @@
mode: params.publish_dir_mode,
pattern: "*.{settings}"
]
ext.prefix = { "${meta.id}_ar2" }
ext.prefix = { "${meta.id}_run${meta.run}_ar2" }
tag = { "${meta.id}_run${meta.run}" }
}

withName: BOWTIE2_PHIX_REMOVAL_ALIGN {
ext.prefix = { "${meta.id}.phix_removed" }
ext.prefix = { "${meta.id}_run${meta.run}_phix_removed" }
publishDir = [
[
path: { "${params.outdir}/QC_shortreads/remove_phix" },
@@ -105,12 +111,13 @@
enabled: params.save_phixremoved_reads
]
]
tag = { "${meta.id}_run${meta.run}" }
}

withName: BOWTIE2_HOST_REMOVAL_ALIGN {
ext.args = params.host_removal_verysensitive ? "--very-sensitive" : "--sensitive"
ext.args2 = params.host_removal_save_ids ? "--host_removal_save_ids" : ''
ext.prefix = { "${meta.id}.host_removed" }
ext.prefix = { "${meta.id}_run${meta.run}_host_removed" }
publishDir = [
[
path: { "${params.outdir}/QC_shortreads/remove_host" },
@@ -124,16 +131,18 @@
enabled: params.save_hostremoved_reads
]
]
tag = { "${meta.id}_run${meta.run}" }
}

withName: FASTQC_TRIMMED {
ext.args = '--quiet'
ext.prefix = { "${meta.id}.trimmed" }
ext.prefix = { "${meta.id}_run${meta.run}_trimmed" }
publishDir = [
path: { "${params.outdir}/QC_shortreads/fastqc" },
mode: params.publish_dir_mode,
pattern: "*.html"
]
tag = { "${meta.id}_run${meta.run}" }
}

withName: BBMAP_BBNORM {
@@ -165,6 +174,7 @@
pattern: "*_porechop.fastq",
enabled: params.save_porechop_reads
]
ext.prefix = { "${meta.id}_run${meta.run}_trimmed" }
}

withName: FILTLONG {
@@ -174,6 +184,7 @@
pattern: "*_lr_filtlong.fastq.gz",
enabled: params.save_filtlong_reads
]
ext.prefix = { "${meta.id}_run${meta.run}_lengthfiltered" }
}

withName: NANOLYSE {
@@ -190,6 +201,7 @@
enabled: params.save_lambdaremoved_reads
]
]
ext.prefix = { "${meta.id}_run${meta.run}_lambdafiltered" }
}

withName: NANOPLOT_RAW {
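The `ext.prefix` and `tag` closures added throughout `conf/modules.config` above thread the run identifier into each per-run output name and task tag. As a hedged illustration (plain Groovy with a hypothetical `meta` map, not pipeline code), such a closure resolves like this:

```groovy
// Hypothetical meta map, shaped like the one the pipeline builds per samplesheet row
def meta = [id: 'sample1', run: 2]

// Same closure shape as the ext.prefix entries above
def prefix = { "${meta.id}_run${meta.run}_fastp" }

// Resolves to a per-run filename prefix
assert prefix() == 'sample1_run2_fastp'
```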
2 changes: 1 addition & 1 deletion conf/test.config
@@ -20,7 +20,7 @@ params {
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv'
centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz"
kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz"
skip_krona = true
2 changes: 1 addition & 1 deletion conf/test_full.config
@@ -19,7 +19,7 @@ params {
// Input data for full size test
// hg19 reference with highly conserved and low-complexity regions masked by Brian Bushnell
host_fasta = "s3://ngi-igenomes/test-data/mag/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz"
input = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.full.csv"
input = "s3://ngi-igenomes/test-data/mag/samplesheets/samplesheet.full.csv"

centrifuge_db = "s3://ngi-igenomes/test-data/mag/p_compressed+h+v.tar.gz"
kraken2_db = "s3://ngi-igenomes/test-data/mag/minikraken_8GB_202003.tgz"
43 changes: 43 additions & 0 deletions conf/test_nothing.config
@@ -0,0 +1,43 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Runs input data but skips all possible steps, to allow for a fast testing
profile for input checks etc.
Use as follows:
nextflow run nf-core/mag -profile test_nothing,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
centrifuge_db = null
kraken2_db = null
skip_krona = true
skip_clipping = true
skip_adapter_trimming = true
skip_spades = true
skip_spadeshybrid = true
skip_megahit = true
skip_quast = true
skip_prodigal = true
skip_binning = true
skip_metabat2 = true
skip_maxbin2 = true
skip_concoct = true
skip_prokka = true
skip_binqc = true
gtdb = false
}
23 changes: 17 additions & 6 deletions docs/usage.md
@@ -27,12 +27,13 @@ Please note the following additional requirements:
- When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs
- To run single-end data you must additionally specify `--single_end`
- If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz`
- Sample name and run combinations must be unique

### Samplesheet input file

Alternatively, to assign different groups or to include long reads for hybrid assembly with metaSPAdes, you can specify a CSV samplesheet input file that contains the paths to your FASTQ files and additional metadata.
Alternatively, to assign different groups or to include long reads for hybrid assembly with metaSPAdes, you can specify a CSV samplesheet input file that contains the paths to your FASTQ files and additional metadata. Furthermore, when a `run` column is present, the pipeline will also perform run- or lane-wise concatenation, for cases where you may have a sample or library sequenced with the same sequencing configuration across multiple runs. The optional run merging happens after short-read QC (adapter clipping, host/PhiX removal etc.), and prior to normalisation, taxonomic profiling, and assembly.
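For orientation, run-wise concatenation of this kind is typically implemented by grouping the post-QC reads on the sample-level metadata and passing each group to the nf-core `cat/fastq` module added by this PR. A minimal sketch under that assumption (the channel names are hypothetical, not the pipeline's exact code):

```nextflow
// Hedged sketch: merge per-run reads of the same sample after short-read QC.
// ch_qc_reads is a hypothetical channel of [ meta, reads ] tuples, where meta
// carries id, group, run and single_end fields from the samplesheet.
ch_to_merge = ch_qc_reads
    .map { meta, reads -> [ meta.subMap(['id', 'group', 'single_end']), reads ] }
    .groupTuple()
    .map { meta, reads -> [ meta, reads.flatten() ] }

// One merged FASTQ (or pair) is emitted per sample
CAT_FASTQ ( ch_to_merge )
```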

This CSV file should contain the following columns:
At a minimum, the CSV file should contain the following columns:

`sample,group,short_reads_1,short_reads_2,long_reads`

@@ -53,12 +54,22 @@ sample1,0,data/sample1.fastq.gz,,
sample2,0,data/sample2.fastq.gz,,
```

or, to additionally perform run merging of two runs of sample1:

```bash
sample,run,group,short_reads_1,short_reads_2,long_reads
sample1,1,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
sample1,2,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
sample2,0,0,data/sample2_R1.fastq.gz,data/sample2_R2.fastq.gz,data/sample2.fastq.gz
sample3,1,0,data/sample3_R1.fastq.gz,data/sample3_R2.fastq.gz,
```

Please note the following requirements:

- 5 comma-seperated columns
- a minimum of 5 comma-separated columns
- Valid file extension: `.csv`
- Must contain the header `sample,group,short_reads_1,short_reads_2,long_reads`
- Sample IDs must be unique
- Must contain the header `sample,group,short_reads_1,short_reads_2,long_reads` (where `run` can optionally be added after `sample`)
- Run IDs must be unique within a multi-run sample; runs of the same sample will be automatically concatenated
- FastQ files must be compressed (`.fastq.gz`, `.fq.gz`)
- `long_reads` can only be provided in combination with paired-end short read data
- Within one samplesheet either only single-end or only paired-end reads can be specified
@@ -105,7 +116,7 @@ group-1,1,MEGAHIT,MEGAHIT-group-1.contigs.fa.gz
group-1,1,SPAdes,SPAdes-group-1.contigs.fasta.gz
```

When supplying pre-computed assemblies, reads **must** also be provided in the CSV input format to `--input`, and should be the reads used to build the assemblies. As long reads are only used for assembly, any long read fastq files listed in the reads CSV are ignored.
When supplying pre-computed assemblies, reads **must** also be provided in the CSV input format to `--input`, and should be the reads used to build the assemblies, i.e. adapter-removed, run-merged etc. Preprocessing steps will not be run on raw reads when pre-computed assemblies are supplied. As long reads are only used for assembly, any long read fastq files listed in the reads CSV are ignored.

## Running the pipeline

5 changes: 5 additions & 0 deletions modules.json
@@ -36,6 +36,11 @@
"git_sha": "fa12afdf5874c1d11e4a20efe81c97935e8eea24",
"installed_by": ["modules"]
},
"cat/fastq": {
"branch": "master",
"git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e",
"installed_by": ["modules"]
},
"checkm/lineagewf": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
80 changes: 80 additions & 0 deletions modules/nf-core/cat/fastq/main.nf


40 changes: 40 additions & 0 deletions modules/nf-core/cat/fastq/meta.yml

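Neither of the two new module files is rendered in this view. For reference, the interface of the nf-core `cat/fastq` module looks roughly like the sketch below (reconstructed from the module's documented inputs and outputs, single-end case only; not a verbatim copy of the committed file):

```nextflow
process CAT_FASTQ {
    tag "$meta.id"

    input:
    // All per-run FASTQ files of one sample, staged into separate input dirs
    tuple val(meta), path(reads, stageAs: "input*/*")

    output:
    tuple val(meta), path("*.merged.fastq.gz"), emit: reads
    path "versions.yml"                       , emit: versions

    script:
    // The real module also branches on meta.single_end to merge R1/R2 separately
    def readList = reads instanceof List ? reads.collect { it.toString() } : [reads.toString()]
    """
    cat ${readList.join(' ')} > ${meta.id}.merged.fastq.gz

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
    END_VERSIONS
    """
}
```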

1 change: 1 addition & 0 deletions nextflow.config
@@ -293,6 +293,7 @@ profiles {
test_binrefinement { includeConfig 'conf/test_binrefinement.config' }
test_no_clipping { includeConfig 'conf/test_no_clipping.config' }
test_bbnorm { includeConfig 'conf/test_bbnorm.config' }
test_nothing { includeConfig 'conf/test_nothing.config' }
}


2 changes: 2 additions & 0 deletions subworkflows/local/binning_preparation.nf
@@ -26,13 +26,15 @@ workflow BINNING_PREPARATION {
.map { meta, assembly, index -> [ meta.group, meta, assembly, index ] }
.combine(ch_reads_bowtie2, by: 0)
.map { group, assembly_meta, assembly, index, reads_meta, reads -> [ assembly_meta, assembly, index, reads_meta, reads ] }

} else {
// combine assemblies (not co-assembled) with reads from own sample
ch_reads_bowtie2 = reads.map{ meta, reads -> [ meta.id, meta, reads ] }
ch_bowtie2_input = BOWTIE2_ASSEMBLY_BUILD.out.assembly_index
.map { meta, assembly, index -> [ meta.id, meta, assembly, index ] }
.combine(ch_reads_bowtie2, by: 0)
.map { id, assembly_meta, assembly, index, reads_meta, reads -> [ assembly_meta, assembly, index, reads_meta, reads ] }

}

BOWTIE2_ASSEMBLY_ALIGN ( ch_bowtie2_input )