Merge pull request #435 from jfy133/run-merging
Adds support for run merging for multi-run samples
jfy133 authored Jun 7, 2023
2 parents 3cfcc21 + 3f6d609 commit 5852d01
Showing 15 changed files with 291 additions and 51 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -61,6 +61,7 @@ jobs:
test_ancient_dna,
test_adapterremoval,
test_binrefinement,
test_binning_entry,
]
steps:
- name: Free some space
1 change: 0 additions & 1 deletion CHANGELOG.md
@@ -24,7 +24,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#406](https://github.com/nf-core/mag/pull/406) - Fix CheckM database always downloading, regardless if CheckM is selected (by @jfy133)
- [#419](https://github.com/nf-core/mag/pull/419) - Fix bug with busco_clean parameter, where it is always activated (by @prototaxites)
- [#426](https://github.com/nf-core/mag/pull/426) - Fixed typo in help text for parameters `--host_genome` and `--host_fasta` (by @tillenglert)
- [#429](https://github.com/nf-core/mag/pull/429) - Replaced hardcoded CheckM database auto-download URL to a parameter (reported by @erikrikarddaniel, fix by @jfy133)
- [#434](https://github.com/nf-core/mag/pull/434) - Fix location of samplesheet for AWS full tests (reported by @Lfulcrum, fix by @jfy133)
- [#438](https://github.com/nf-core/mag/pull/438) - Fixed version inconsistency between conda and containers for GTDBTK_CLASSIFYWF (by @jfy133)
- [#439](https://github.com/nf-core/mag/pull/445) - Fix bug in assembly input (by @prototaxites)
2 changes: 1 addition & 1 deletion README.md
@@ -23,7 +23,7 @@

## Pipeline summary

By default, the pipeline currently performs the following: it supports both short and long reads, quality trims the reads and adapters with [fastp](https://github.com/OpenGene/fastp) and [Porechop](https://github.com/rrwick/Porechop), and performs basic QC with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
By default, the pipeline currently performs the following: it supports both short and long reads, quality trims the reads and adapters with [fastp](https://github.com/OpenGene/fastp) and [Porechop](https://github.com/rrwick/Porechop), performs basic QC with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), and merges multiple sequencing runs.

The pipeline then:

22 changes: 17 additions & 5 deletions conf/modules.config
@@ -27,6 +27,8 @@ process {
mode: params.publish_dir_mode,
pattern: "*.html"
]
ext.prefix = { "${meta.id}_run${meta.run}_raw" }
tag = { "${meta.id}_run${meta.run}_raw" }
}

withName: FASTP {
@@ -50,6 +52,8 @@
enabled: params.save_clipped_reads
]
]
ext.prefix = { "${meta.id}_run${meta.run}_fastp" }
tag = { "${meta.id}_run${meta.run}" }
}

withName: ADAPTERREMOVAL_PE {
@@ -72,7 +76,8 @@
enabled: params.save_clipped_reads
]
]
ext.prefix = { "${meta.id}_ar2" }
ext.prefix = { "${meta.id}_run${meta.run}_ar2" }
tag = { "${meta.id}_run${meta.run}" }
}

withName: ADAPTERREMOVAL_SE {
@@ -87,11 +92,12 @@
mode: params.publish_dir_mode,
pattern: "*.{settings}"
]
ext.prefix = { "${meta.id}_ar2" }
ext.prefix = { "${meta.id}_run${meta.run}_ar2" }
tag = { "${meta.id}_run${meta.run}" }
}

withName: BOWTIE2_PHIX_REMOVAL_ALIGN {
ext.prefix = { "${meta.id}.phix_removed" }
ext.prefix = { "${meta.id}_run${meta.run}_phix_removed" }
publishDir = [
[
path: { "${params.outdir}/QC_shortreads/remove_phix" },
@@ -105,12 +111,13 @@
enabled: params.save_phixremoved_reads
]
]
tag = { "${meta.id}_run${meta.run}" }
}

withName: BOWTIE2_HOST_REMOVAL_ALIGN {
ext.args = params.host_removal_verysensitive ? "--very-sensitive" : "--sensitive"
ext.args2 = params.host_removal_save_ids ? "--host_removal_save_ids" : ''
ext.prefix = { "${meta.id}.host_removed" }
ext.prefix = { "${meta.id}_run${meta.run}_host_removed" }
publishDir = [
[
path: { "${params.outdir}/QC_shortreads/remove_host" },
@@ -124,16 +131,18 @@
enabled: params.save_hostremoved_reads
]
]
tag = { "${meta.id}_run${meta.run}" }
}

withName: FASTQC_TRIMMED {
ext.args = '--quiet'
ext.prefix = { "${meta.id}.trimmed" }
ext.prefix = { "${meta.id}_run${meta.run}_trimmed" }
publishDir = [
path: { "${params.outdir}/QC_shortreads/fastqc" },
mode: params.publish_dir_mode,
pattern: "*.html"
]
tag = { "${meta.id}_run${meta.run}" }
}

withName: BBMAP_BBNORM {
@@ -165,6 +174,7 @@
pattern: "*_porechop.fastq",
enabled: params.save_porechop_reads
]
ext.prefix = { "${meta.id}_run${meta.run}_trimmed" }
}

withName: FILTLONG {
@@ -174,6 +184,7 @@
pattern: "*_lr_filtlong.fastq.gz",
enabled: params.save_filtlong_reads
]
ext.prefix = { "${meta.id}_run${meta.run}_lengthfiltered" }
}

withName: NANOLYSE {
@@ -190,6 +201,7 @@
enabled: params.save_lambdaremoved_reads
]
]
ext.prefix = { "${meta.id}_run${meta.run}_lambdafiltered" }
}

withName: NANOPLOT_RAW {
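The `ext.prefix` and `tag` closures added throughout `conf/modules.config` above thread the run identifier into each per-run output name and task tag. As a hedged illustration (plain Groovy with a hypothetical `meta` map, not pipeline code), such a closure resolves like this:

```groovy
// Hypothetical meta map, shaped like the one the pipeline builds per samplesheet row
def meta = [id: 'sample1', run: 2]

// Same closure shape as the ext.prefix entries above
def prefix = { "${meta.id}_run${meta.run}_fastp" }

// Resolves to a per-run filename prefix
assert prefix() == 'sample1_run2_fastp'
```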
2 changes: 1 addition & 1 deletion conf/test.config
@@ -20,7 +20,7 @@ params {
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv'
centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz"
kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz"
skip_krona = true
2 changes: 1 addition & 1 deletion conf/test_full.config
@@ -19,7 +19,7 @@ params {
// Input data for full size test
// hg19 reference with highly conserved and low-complexity regions masked by Brian Bushnell
host_fasta = "s3://ngi-igenomes/test-data/mag/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz"
input = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.full.csv"
input = "s3://ngi-igenomes/test-data/mag/samplesheets/samplesheet.full.csv"

centrifuge_db = "s3://ngi-igenomes/test-data/mag/p_compressed+h+v.tar.gz"
kraken2_db = "s3://ngi-igenomes/test-data/mag/minikraken_8GB_202003.tgz"
43 changes: 43 additions & 0 deletions conf/test_nothing.config
@@ -0,0 +1,43 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Runs input data but skips all possible steps, to allow for a fast testing
profile for input checks etc.
Use as follows:
nextflow run nf-core/mag -profile test_nothing,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv'
centrifuge_db = null
kraken2_db = null
skip_krona = true
skip_clipping = true
skip_adapter_trimming = true
skip_spades = true
skip_spadeshybrid = true
skip_megahit = true
skip_quast = true
skip_prodigal = true
skip_binning = true
skip_metabat2 = true
skip_maxbin2 = true
skip_concoct = true
skip_prokka = true
skip_binqc = true
gtdb = false
}
23 changes: 17 additions & 6 deletions docs/usage.md
@@ -27,12 +27,13 @@ Please note the following additional requirements:
- When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs
- To run single-end data you must additionally specify `--single_end`
- If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz`
- Sample name and run combinations must be unique

### Samplesheet input file

Alternatively, to assign different groups or to include long reads for hybrid assembly with metaSPAdes, you can specify a CSV samplesheet input file that contains the paths to your FASTQ files and additional metadata.
Alternatively, to assign different groups or to include long reads for hybrid assembly with metaSPAdes, you can specify a CSV samplesheet input file that contains the paths to your FASTQ files and additional metadata. Furthermore, when a `run` column is present, the pipeline will also perform run- or lane-wise concatenation, for cases where you may have a sample or library sequenced with the same sequencing configuration across multiple runs. The optional run merging happens after short-read QC (adapter clipping, host/PhiX removal etc.), and prior to normalisation, taxonomic profiling, and assembly.
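For orientation, run-wise concatenation of this kind is typically implemented by grouping the post-QC reads on the sample-level metadata and passing each group to the nf-core `cat/fastq` module added by this PR. A minimal sketch under that assumption (the channel names are hypothetical, not the pipeline's exact code):

```nextflow
// Hedged sketch: merge per-run reads of the same sample after short-read QC.
// ch_qc_reads is a hypothetical channel of [ meta, reads ] tuples, where meta
// carries id, group, run and single_end fields from the samplesheet.
ch_to_merge = ch_qc_reads
    .map { meta, reads -> [ meta.subMap(['id', 'group', 'single_end']), reads ] }
    .groupTuple()
    .map { meta, reads -> [ meta, reads.flatten() ] }

// One merged FASTQ (or pair) is emitted per sample
CAT_FASTQ ( ch_to_merge )
```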

This CSV file should contain the following columns:
At a minimum, the CSV file should contain the following columns:

`sample,group,short_reads_1,short_reads_2,long_reads`

@@ -53,12 +54,22 @@ sample1,0,data/sample1.fastq.gz,,
sample2,0,data/sample2.fastq.gz,,
```

or, to additionally perform run merging of two runs of sample1:

```bash
sample,run,group,short_reads_1,short_reads_2,long_reads
sample1,1,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
sample1,2,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
sample2,0,0,data/sample2_R1.fastq.gz,data/sample2_R2.fastq.gz,data/sample2.fastq.gz
sample3,1,0,data/sample3_R1.fastq.gz,data/sample3_R2.fastq.gz,
```

Please note the following requirements:

- 5 comma-seperated columns
- a minimum of 5 comma-separated columns
- Valid file extension: `.csv`
- Must contain the header `sample,group,short_reads_1,short_reads_2,long_reads`
- Sample IDs must be unique
- Must contain the header `sample,group,short_reads_1,short_reads_2,long_reads` (where `run` can optionally be added after `sample`)
- Run IDs must be unique within a multi-run sample; runs of the same sample will be automatically concatenated
- FastQ files must be compressed (`.fastq.gz`, `.fq.gz`)
- `long_reads` can only be provided in combination with paired-end short read data
- Within one samplesheet either only single-end or only paired-end reads can be specified
@@ -105,7 +116,7 @@ group-1,1,MEGAHIT,MEGAHIT-group-1.contigs.fa.gz
group-1,1,SPAdes,SPAdes-group-1.contigs.fasta.gz
```

When supplying pre-computed assemblies, reads **must** also be provided in the CSV input format to `--input`, and should be the reads used to build the assemblies. As long reads are only used for assembly, any long read fastq files listed in the reads CSV are ignored.
When supplying pre-computed assemblies, reads **must** also be provided in the CSV input format to `--input`, and should be the reads used to build the assemblies, i.e. adapter-removed, run-merged etc. Preprocessing steps will not be run on raw reads when pre-computed assemblies are supplied. As long reads are only used for assembly, any long read fastq files listed in the reads CSV are ignored.

## Running the pipeline

5 changes: 5 additions & 0 deletions modules.json
@@ -36,6 +36,11 @@
"git_sha": "fa12afdf5874c1d11e4a20efe81c97935e8eea24",
"installed_by": ["modules"]
},
"cat/fastq": {
"branch": "master",
"git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e",
"installed_by": ["modules"]
},
"checkm/lineagewf": {
"branch": "master",
"git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
80 changes: 80 additions & 0 deletions modules/nf-core/cat/fastq/main.nf


40 changes: 40 additions & 0 deletions modules/nf-core/cat/fastq/meta.yml

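Neither of the two new module files is rendered in this view. For reference, the interface of the nf-core `cat/fastq` module looks roughly like the sketch below (reconstructed from the module's documented inputs and outputs, single-end case only; not a verbatim copy of the committed file):

```nextflow
process CAT_FASTQ {
    tag "$meta.id"

    input:
    // All per-run FASTQ files of one sample, staged into separate input dirs
    tuple val(meta), path(reads, stageAs: "input*/*")

    output:
    tuple val(meta), path("*.merged.fastq.gz"), emit: reads
    path "versions.yml"                       , emit: versions

    script:
    // The real module also branches on meta.single_end to merge R1/R2 separately
    def readList = reads instanceof List ? reads.collect { it.toString() } : [reads.toString()]
    """
    cat ${readList.join(' ')} > ${meta.id}.merged.fastq.gz

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//')
    END_VERSIONS
    """
}
```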

1 change: 1 addition & 0 deletions nextflow.config
@@ -293,6 +293,7 @@ profiles {
test_binrefinement { includeConfig 'conf/test_binrefinement.config' }
test_no_clipping { includeConfig 'conf/test_no_clipping.config' }
test_bbnorm { includeConfig 'conf/test_bbnorm.config' }
test_nothing { includeConfig 'conf/test_nothing.config' }
}


2 changes: 2 additions & 0 deletions subworkflows/local/binning_preparation.nf
@@ -26,13 +26,15 @@ workflow BINNING_PREPARATION {
.map { meta, assembly, index -> [ meta.group, meta, assembly, index ] }
.combine(ch_reads_bowtie2, by: 0)
.map { group, assembly_meta, assembly, index, reads_meta, reads -> [ assembly_meta, assembly, index, reads_meta, reads ] }

} else {
// combine assemblies (not co-assembled) with reads from own sample
ch_reads_bowtie2 = reads.map{ meta, reads -> [ meta.id, meta, reads ] }
ch_bowtie2_input = BOWTIE2_ASSEMBLY_BUILD.out.assembly_index
.map { meta, assembly, index -> [ meta.id, meta, assembly, index ] }
.combine(ch_reads_bowtie2, by: 0)
.map { id, assembly_meta, assembly, index, reads_meta, reads -> [ assembly_meta, assembly, index, reads_meta, reads ] }

}

BOWTIE2_ASSEMBLY_ALIGN ( ch_bowtie2_input )