From 21636c6d5bbff5093c73e9094133310da3d2d265 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 23 Mar 2022 08:45:48 +0100 Subject: [PATCH 1/6] Add AdapterRemoval2 as an alternative trimmer --- CHANGELOG.md | 1 + modules.json | 3 + modules/local/multiqc.nf | 2 +- .../nf-core/modules/adapterremoval/main.nf | 70 +++++++++++++++ .../nf-core/modules/adapterremoval/meta.yml | 90 +++++++++++++++++++ nextflow.config | 3 +- subworkflows/local/binning.nf | 2 +- workflows/mag.nf | 75 ++++++++++++---- 8 files changed, 226 insertions(+), 20 deletions(-) create mode 100644 modules/nf-core/modules/adapterremoval/main.nf create mode 100644 modules/nf-core/modules/adapterremoval/meta.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 96f94b7e..2b583685 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#263](https://github.com/nf-core/mag/pull/263) - Restructure binning subworkflow in preparation for aDNA workflow and extended binning - [#247](https://github.com/nf-core/mag/pull/247) - Add ancient DNA subworkflow - [#263](https://github.com/nf-core/mag/pull/263) - Add MaxBin2 as second contig binning tool +- [#264](https://github.com/nf-core/mag/issues/284) - Add AdapterRemoval2 as an alternative read trimmer ### `Changed` diff --git a/modules.json b/modules.json index 1b1164a9..b9a2bb74 100644 --- a/modules.json +++ b/modules.json @@ -3,6 +3,9 @@ "homePage": "https://github.com/nf-core/mag", "repos": { "nf-core/modules": { + "adapterremoval": { + "git_sha": "f0800157544a82ae222931764483331a81812012" + }, "bcftools/consensus": { "git_sha": "20d8250d9f39ddb05dfb437603aaf99b5c0b2b41" }, diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 5faa7a4f..778eb97f 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -10,12 +10,12 @@ process MULTIQC { path multiqc_files path mqc_custom_config path 'fastqc_raw/*' - path 'fastp/*' path 
'fastqc_trimmed/*' path host_removal path 'quast*/*' path 'bowtie2log/*' path short_summary + path additional output: path "*multiqc_report.html", emit: report diff --git a/modules/nf-core/modules/adapterremoval/main.nf b/modules/nf-core/modules/adapterremoval/main.nf new file mode 100644 index 00000000..9d16b9c9 --- /dev/null +++ b/modules/nf-core/modules/adapterremoval/main.nf @@ -0,0 +1,70 @@ +process ADAPTERREMOVAL { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::adapterremoval=2.3.2" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/adapterremoval:2.3.2--hb7ba0dd_0' : + 'quay.io/biocontainers/adapterremoval:2.3.2--hb7ba0dd_0' }" + + input: + tuple val(meta), path(reads) + path(adapterlist) + + output: + tuple val(meta), path("${prefix}.truncated.gz") , optional: true, emit: singles_truncated + tuple val(meta), path("${prefix}.discarded.gz") , optional: true, emit: discarded + tuple val(meta), path("${prefix}.pair1.truncated.gz") , optional: true, emit: pair1_truncated + tuple val(meta), path("${prefix}.pair2.truncated.gz") , optional: true, emit: pair2_truncated + tuple val(meta), path("${prefix}.collapsed.gz") , optional: true, emit: collapsed + tuple val(meta), path("${prefix}.collapsed.truncated.gz") , optional: true, emit: collapsed_truncated + tuple val(meta), path("${prefix}.paired.gz") , optional: true, emit: paired_interleaved + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def list = adapterlist ?
"--adapter-list ${adapterlist}" : "" + prefix = task.ext.prefix ?: "${meta.id}" + + if (meta.single_end) { + """ + AdapterRemoval \\ + --file1 $reads \\ + $args \\ + $list \\ + --basename ${prefix} \\ + --threads ${task.cpus} \\ + --settings ${prefix}.log \\ + --seed 42 \\ + --gzip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") + END_VERSIONS + """ + } else { + """ + AdapterRemoval \\ + --file1 ${reads[0]} \\ + --file2 ${reads[1]} \\ + $args \\ + $list \\ + --basename ${prefix} \\ + --threads $task.cpus \\ + --settings ${prefix}.log \\ + --seed 42 \\ + --gzip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") + END_VERSIONS + """ + } + +} diff --git a/modules/nf-core/modules/adapterremoval/meta.yml b/modules/nf-core/modules/adapterremoval/meta.yml new file mode 100644 index 00000000..5faad043 --- /dev/null +++ b/modules/nf-core/modules/adapterremoval/meta.yml @@ -0,0 +1,90 @@ +name: adapterremoval +description: Trim sequencing adapters and collapse overlapping reads +keywords: + - trimming + - adapters + - merging + - fastq +tools: + - adapterremoval: + description: The AdapterRemoval v2 tool for merging and clipping reads. + homepage: https://github.com/MikkelSchubert/adapterremoval + documentation: https://adapterremoval.readthedocs.io + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" + - adapterlist: + type: file + description: Optional text file containing list of adapters to look for for removal + with one adapter per line.
Otherwise will look for default adapters (see + AdapterRemoval man page), or can be modified to remove user-specified + adapters via ext.args. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - singles_truncated: + type: file + description: | + Adapter trimmed FastQ files of either single-end reads, or singleton + 'orphaned' reads from merging of paired-end data (i.e., one of the pair + was lost due to filtering thresholds). + pattern: "*.truncated.gz" + - discarded: + type: file + description: | + Adapter trimmed FastQ files of reads that did not pass filtering + thresholds. + pattern: "*.discarded.gz" + - pair1_truncated: + type: file + description: | + Adapter trimmed R1 FastQ files of paired-end reads that did not merge + with their respective R2 pair due to long templates. The respective pair + is stored in 'pair2_truncated'. + pattern: "*.pair1.truncated.gz" + - pair2_truncated: + type: file + description: | + Adapter trimmed R2 FastQ files of paired-end reads that did not merge + with their respective R1 pair due to long templates. The respective pair + is stored in 'pair1_truncated'. + pattern: "*.pair2.truncated.gz" + - collapsed: + type: file + description: | + Collapsed FastQ of paired-end reads that successfully merged with their + respective R1 pair but were not trimmed. + pattern: "*.collapsed.gz" + - collapsed_truncated: + type: file + description: | + Collapsed FastQ of paired-end reads that successfully merged with their + respective R1 pair and were trimmed of adapter due to sufficient overlap. 
+ pattern: "*.collapsed.truncated.gz" + - log: + type: file + description: AdapterRemoval log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@maxibor" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 88f83e6c..e3b67f64 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,11 +14,12 @@ params { single_end = false // short read preprocessing options + clip_tool = 'fastp' save_trimmed_fail = false fastp_qualified_quality = 15 fastp_cut_mean_quality = 15 keep_phix = false - // phix_reference = "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/viral/Enterobacteria_phage_phiX174_sensu_lato/all_assembly_versions/GCA_002596845.1_ASM259684v1/GCA_002596845.1_ASM259684v1_genomic.fna.gz" + // phix_reference = "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/viral/Enterobacteria_phage_phiX174_sensu_lato/all_assembly_versions/GCA_002596845.1_ASM259684v1/GCA_002596845.1_ASM259684v1_genomic.fna.gz" phix_reference = "${baseDir}/assets/data/GCA_002596845.1_ASM259684v1_genomic.fna.gz" host_fasta = null host_genome = null diff --git a/subworkflows/local/binning.nf b/subworkflows/local/binning.nf index 28871713..6ba4b02d 100644 --- a/subworkflows/local/binning.nf +++ b/subworkflows/local/binning.nf @@ -174,5 +174,5 @@ workflow BINNING { unbinned = ch_splitfasta_results_gunzipped.groupTuple() unbinned_gz = SPLIT_FASTA.out.unbinned depths_summary = MAG_DEPTHS_SUMMARY.out.summary - versions = ch_versions.dump(tag: "binning_versions") + versions = ch_versions } diff --git a/workflows/mag.nf b/workflows/mag.nf index c43d8ee5..32dbbb17 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -102,12 +102,14 @@ include { ANCIENT_DNA_ASSEMLY_VALIDATION } from '../subworkflows/local/ancient_d // // MODULE: Installed directly from nf-core/modules // -include { FASTQC as FASTQC_RAW } from '../modules/nf-core/modules/fastqc/main' -include { FASTQC as FASTQC_TRIMMED } from 
'../modules/nf-core/modules/fastqc/main' -include { FASTP } from '../modules/nf-core/modules/fastp/main' -include { PRODIGAL } from '../modules/nf-core/modules/prodigal/main' -include { PROKKA } from '../modules/nf-core/modules/prokka/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' +include { FASTQC as FASTQC_RAW } from '../modules/nf-core/modules/fastqc/main' +include { FASTQC as FASTQC_TRIMMED } from '../modules/nf-core/modules/fastqc/main' +include { FASTP } from '../modules/nf-core/modules/fastp/main' +include { ADAPTERREMOVAL as ADAPTERREMOVAL_PE } from '../modules/nf-core/modules/adapterremoval/main' +include { ADAPTERREMOVAL as ADAPTERREMOVAL_SE } from '../modules/nf-core/modules/adapterremoval/main' +include { PRODIGAL } from '../modules/nf-core/modules/prodigal/main' +include { PROKKA } from '../modules/nf-core/modules/prokka/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' //////////////////////////////////////////////////// /* -- Create channel for reference databases -- */ @@ -223,13 +225,45 @@ workflow MAG { ) ch_versions = ch_versions.mix(FASTQC_RAW.out.versions.first()) - FASTP ( - ch_raw_short_reads, - [], - [] - ) - ch_short_reads = FASTP.out.reads - ch_versions = ch_versions.mix(FASTP.out.versions.first()) + if ( params.clip_tool == 'fastp' ) { + ch_clipmerge_out = FASTP ( + ch_raw_short_reads, + [], + [] + ) + ch_short_reads = FASTP.out.reads + ch_versions = ch_versions.mix(FASTP.out.versions.first()) + + } else if ( params.clip_tool == 'adapterremoval' ) { + + // due to strange output file scheme in AR2, have to manually separate + // SE/PE to allow correct pulling of reads after. 
+ ch_adapterremoval_in = ch_raw_short_reads + .branch { + single: it[0]['single_end'] + paired: !it[0]['single_end'] + } + + ADAPTERREMOVAL_PE ( ch_adapterremoval_in.paired, [] ) + ADAPTERREMOVAL_SE ( ch_adapterremoval_in.single, [] ) + + // pair1 and 2 come for PE data from separate output channels, so bring + // this back together again here + ch_adapterremoval_pe_out = Channel.empty() + ch_adapterremoval_pe_out = ADAPTERREMOVAL_PE.out.pair1_truncated + .join(ADAPTERREMOVAL_PE.out.pair2_truncated) + .dump(tag: "ar_pe_mix_out") + .map { + [ it[0], [it[1], it[2]] ] + } + .dump(tag: "ar2_pe_out") + + ch_short_reads = Channel.empty() + ch_short_reads = ch_short_reads.mix(ADAPTERREMOVAL_SE.out.singles_truncated, ch_adapterremoval_pe_out) + + ch_versions = ch_versions.mix(ADAPTERREMOVAL_PE.out.versions, ADAPTERREMOVAL_SE.out.versions) + + } if (params.host_fasta){ BOWTIE2_HOST_REMOVAL_BUILD ( @@ -544,7 +578,7 @@ workflow MAG { /* * BUSCO subworkflow: Quantitative measures for the assessment of genome assembly */ - ch_input_bins_busco = BINNING.out.bins.mix( BINNING.out.unbinned ).transpose().dump(tag: "input_to_busco") + ch_input_bins_busco = BINNING.out.bins.mix( BINNING.out.unbinned ).transpose() BUSCO_QC ( ch_busco_db_file, ch_busco_download_folder, @@ -570,7 +604,6 @@ workflow MAG { def new_reads = reads.flatten() [meta, new_reads] } - .dump(tag: "input_for_quast_bins") QUAST_BINS ( ch_input_for_quast_bins ) ch_versions = ch_versions.mix(QUAST_BINS.out.versions.first()) QUAST_BINS_SUMMARY ( QUAST_BINS.out.quast_bin_summaries.collect() ) @@ -667,16 +700,24 @@ workflow MAG { */ //This is the local module because (1) fastq file name clashes [probably solveable] and (2) host removal bowtie reporting + ch_multiqc_additional = Channel.empty() + + if ( params.clip_tool == "fastp") { + ch_multiqc_additional = ch_multiqc_additional.mix(FASTP.out.json.collect{it[1]}.ifEmpty([])) + } else if ( params.clip_tool == "adapterremoval" ) { + ch_multiqc_additional = 
ch_multiqc_additional.mix(ADAPTERREMOVAL_PE.out.log.collect{it[1]}.ifEmpty([]), ADAPTERREMOVAL_SE.out.log.collect{it[1]}.ifEmpty([])) + } + MULTIQC ( ch_multiqc_files.collect(), ch_multiqc_custom_config.collect().ifEmpty([]), FASTQC_RAW.out.zip.collect{it[1]}.ifEmpty([]), - FASTP.out.json.collect{it[1]}.ifEmpty([]), FASTQC_TRIMMED.out.zip.collect{it[1]}.ifEmpty([]), ch_bowtie2_removal_host_multiqc.collect{it[1]}.ifEmpty([]), ch_quast_multiqc.collect().ifEmpty([]), ch_bowtie2_assembly_multiqc.collect().ifEmpty([]), - ch_busco_multiqc.collect().ifEmpty([]) + ch_busco_multiqc.collect().ifEmpty([]), + ch_multiqc_additional ) multiqc_report = MULTIQC.out.report.toList() ch_versions = ch_versions.mix(MULTIQC.out.versions) From 1959af760e057e89b77f7744c984e89e53bdd433 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 23 Mar 2022 14:36:04 +0100 Subject: [PATCH 2/6] Add AdapterRemoval2 options and docs --- CHANGELOG.md | 2 +- assets/multiqc_config.yaml | 5 +++++ conf/modules.config | 33 ++++++++++++++++++++++++++- docs/output.md | 14 ++++++++++++ modules/local/multiqc.nf | 6 ++--- nextflow.config | 7 +++++- nextflow_schema.json | 46 ++++++++++++++++++++++++++++++++------ workflows/mag.nf | 4 ++-- 8 files changed, 102 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b583685..d916cd96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#263](https://github.com/nf-core/mag/pull/263) - Restructure binning subworkflow in preparation for aDNA workflow and extended binning - [#247](https://github.com/nf-core/mag/pull/247) - Add ancient DNA subworkflow - [#263](https://github.com/nf-core/mag/pull/263) - Add MaxBin2 as second contig binning tool -- [#264](https://github.com/nf-core/mag/issues/284) - Add AdapterRemoval2 as an alternative read trimmer +- [#264](https://github.com/nf-core/mag/pull/265) - Add AdapterRemoval2 as an alternative read 
trimmer ### `Changed` diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 8c0214e3..65639f09 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -17,6 +17,8 @@ top_modules: name: 'FastQC: raw reads' path_filters_exclude: - '*trimmed*' +- 'fastp' +- 'adapterRemoval': - custom_content - 'fastqc': name: 'FastQC: after preprocessing' @@ -42,6 +44,9 @@ custom_data: sp: host_removal: fn: 'host_removal_metrics.tsv' + adapterRemoval: + fn: '*_ar2.log' extra_fn_clean_exts: - '.bowtie2' + - '_ar2' diff --git a/conf/modules.config b/conf/modules.config index 7b3bfc49..5566ed6b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -34,7 +34,8 @@ process { "-q ${params.fastp_qualified_quality}", "--cut_front", "--cut_tail", - "--cut_mean_quality ${params.fastp_cut_mean_quality}" + "--cut_mean_quality ${params.fastp_cut_mean_quality}", + "--length_required ${params.reads_minlength} " ].join(' ').trim() publishDir = [ path: { "${params.outdir}/QC_shortreads/fastp/${meta.id}" }, @@ -43,6 +44,36 @@ process { ] } + withName: ADAPTERREMOVAL_PE { + ext.args = [ + "--minlength ${params.reads_minlength}", + "--adapter1 ${params.adapterremoval_adapter1} --adapter2 ${params.adapterremoval_adapter2}", + "--minquality ${params.adapterremoval_minquality} --trimns", + params.adapterremoval_trim_quality_stretch ? "--trimqualities" : "--trimwindows 4" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/QC_shortreads/adapterremoval/${meta.id}" }, + mode: 'copy', + pattern: "*.{log}" + ] + ext.prefix = { "${meta.id}_ar2" } + } + + withName: ADAPTERREMOVAL_SE { + ext.args = [ + "--minlength ${params.reads_minlength}", + "--adapter1 ${params.adapterremoval_adapter1}", + "--minquality ${params.adapterremoval_minquality} --trimns", + params.adapterremoval_trim_quality_stretch ?
"--trimqualities" : "--trimwindows 4" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/QC_shortreads/adapterremoval/${meta.id}" }, + mode: 'copy', + pattern: "*.{log}" + ] + ext.prefix = { "${meta.id}_ar2" } + } + withName: BOWTIE2_PHIX_REMOVAL_ALIGN { ext.suffix = "phix_removed" publishDir = [ diff --git a/docs/output.md b/docs/output.md index a389e697..e29274e4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -57,6 +57,20 @@ FastQC is run for visualising the general quality metrics of the sequencing runs +### AdapterRemoval2 + +[fastp](https://github.com/OpenGene/fastp) is a all-in-one fastq preprocessor for read/adapter trimming and quality control. It is used in this pipeline for trimming adapter sequences and discard low-quality reads. Its output is in the results folder and part of the MultiQC report. + +[AdapterRemoval](https://adapterremoval.readthedocs.io/en/stable/) searches for and removes remnant adapter sequences from High-Throughput Sequencing (HTS) data and (optionally) trims low quality bases from the 3' end of reads following adapter removal. It is popular in the field of palaeogenomics. The output logs are stored in the results folder, and as a part of the MultiQC report. + +
+Output files + +* `QC_shortreads/adapterremoval/[sample]/` + * `[sample]_ar2.log`: AdapterRemoval log file (normally called `.settings` by AdapterRemoval.) + +
+ ### Remove PhiX sequences from short reads The pipeline uses bowtie2 to map the reads against PhiX and removes mapped reads. diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 778eb97f..6d8b2cbb 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda (params.enable_conda ? "bioconda::multiqc=1.11" : null) + conda (params.enable_conda ? "bioconda::multiqc=1.12" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.12--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.12--pyhdfd78af_0' }" input: path multiqc_files diff --git a/nextflow.config b/nextflow.config index e3b67f64..61017da5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,9 +15,14 @@ params { // short read preprocessing options clip_tool = 'fastp' - save_trimmed_fail = false + reads_minlength = 15 + fastp_save_trimmed_fail = false fastp_qualified_quality = 15 fastp_cut_mean_quality = 15 + adapterremoval_minquality = 2 + adapterremoval_adapter1 = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG' + adapterremoval_adapter2 = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' + adapterremoval_trim_quality_stretch = false keep_phix = false // phix_reference = "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/viral/Enterobacteria_phage_phiX174_sensu_lato/all_assembly_versions/GCA_002596845.1_ASM259684v1/GCA_002596845.1_ASM259684v1_genomic.fna.gz" phix_reference = "${baseDir}/assets/data/GCA_002596845.1_ASM259684v1_genomic.fna.gz" diff --git a/nextflow_schema.json b/nextflow_schema.json index be926f97..d29f81c7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -264,16 +264,24 @@ "description": "", "default": "", 
"properties": { - "save_trimmed_fail": { - "type": "boolean", - "fa_icon": "fas fa-save", - "description": "Save the by fastp trimmed FastQ files in the results directory.", - "help_text": "By default, trimmed FastQ files will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete." + "clip_tool": { + "type": "string", + "default": "fastp", + "description": "Specify which adapter clipping tool to use. Options: 'fastp', 'adapterremoval'", + "enum": [ + "fastp", + "adapterremoval" + ] + }, + "reads_minlength": { + "type": "integer", + "default": 15, + "description": "The minimum length of reads must have to be retained for downstream analysis." }, "fastp_qualified_quality": { "type": "integer", "default": 15, - "description": "Minimum phred quality value of a base to be qualified.", + "description": "Minimum phred quality value of a base to be qualified in fastp.", "help": "Reads with more than 40% of unqualified bases will be discarded." }, "fastp_cut_mean_quality": { @@ -282,6 +290,30 @@ "description": "The mean quality requirement used for per read sliding window cutting by fastp.", "help": "Used in combination with the fastp options '--cut_front' and '--cut_tail'. If the mean quality within a window (of size 4) is below `--fastp_cut_mean_quality`, the bases are dropped and the sliding window is moved further, otherwise it stops." }, + "fastp_save_trimmed_fail": { + "type": "boolean", + "description": "Save reads that fail fastp filtering in a separate file. Not used downstream." + }, + "adapterremoval_minquality": { + "type": "integer", + "default": 2, + "description": "The minimum base quality for low-quality base trimming by AdapterRemoval." 
+ }, + "adapterremoval_trim_quality_stretch": { + "type": "boolean", + "description": "Turn on quality trimming by consecutive stretches of low-quality bases, rather than by window.", + "help_text": "Default base-quality trimming is set to trim by 'windows', as in fastp. Specifying this flag will instead trim by contiguous stretches of low-quality bases (Ns).\n\n> Replaces --trimwindows 4 with --trimqualities in AdapterRemoval" + }, + "adapterremoval_adapter1": { + "type": "string", + "default": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG", + "description": "Forward read adapter to be trimmed by AdapterRemoval." + }, + "adapterremoval_adapter2": { + "type": "string", + "default": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT", + "description": "Reverse read adapter to be trimmed by AdapterRemoval for paired end data." + }, "host_genome": { "type": "string", "help_text": "This parameter is mutually exclusive with `--host_genome`. Host read removal is done with Bowtie2.
\nBoth the iGenomes FASTA file as well as corresponding, already pre-built Bowtie 2 index files will be used.", @@ -661,4 +693,4 @@ "$ref": "#/definitions/ancient_dna_assembly" } ] -} +} \ No newline at end of file diff --git a/workflows/mag.nf b/workflows/mag.nf index 32dbbb17..c27c7384 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -228,7 +228,7 @@ workflow MAG { if ( params.clip_tool == 'fastp' ) { ch_clipmerge_out = FASTP ( ch_raw_short_reads, - [], + params.fastp_save_trimmed_fail, [] ) ch_short_reads = FASTP.out.reads @@ -717,7 +717,7 @@ workflow MAG { ch_quast_multiqc.collect().ifEmpty([]), ch_bowtie2_assembly_multiqc.collect().ifEmpty([]), ch_busco_multiqc.collect().ifEmpty([]), - ch_multiqc_additional + ch_multiqc_additional.collect().ifEmpty([]), ) multiqc_report = MULTIQC.out.report.toList() ch_versions = ch_versions.mix(MULTIQC.out.versions) From cd05e4c987b50c1b924743754b2be1c13f547ee8 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 23 Mar 2022 14:42:23 +0100 Subject: [PATCH 3/6] Add tests --- .github/workflows/ci.yml | 2 +- conf/test_adapterremoval.config | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 conf/test_adapterremoval.config diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45d45ad8..6bb8bdf8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,7 +55,7 @@ jobs: strategy: matrix: # Run remaining test profiles with minimum nextflow version - profile: [test_host_rm, test_hybrid, test_hybrid_host_rm, test_busco_auto, test_ancient_dna] + profile: [test_host_rm, test_hybrid, test_hybrid_host_rm, test_busco_auto, test_ancient_dna, test_adapterremoval] steps: - name: Check out pipeline code uses: actions/checkout@v2 diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config new file mode 100644 index 00000000..a9f71433 --- /dev/null +++ b/conf/test_adapterremoval.config @@ -0,0 +1,32 @@ +/* 
+======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/mag -profile test_adapterremoval,<docker/singularity> + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.csv' + centrifuge_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz" + kraken2_db = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz" + skip_krona = true + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + gtdb = false + clip_tool = 'adapterremoval' +} From 8513246a6503762449bf595d74d956bffd7bf1dd Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 23 Mar 2022 14:51:09 +0100 Subject: [PATCH 4/6] Add new test to nextflow.config --- nextflow.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 61017da5..2c409bef 100644 --- a/nextflow.config +++ b/nextflow.config @@ -202,7 +202,9 @@ profiles { test_hybrid_host_rm { includeConfig 'conf/test_hybrid_host_rm.config' } test_busco_auto { includeConfig 'conf/test_busco_auto.config' } test_full { includeConfig 'conf/test_full.config' } - test_ancient_dna { includeConfig 
'conf/test_ancient_dna.config' } + test_adapterremoval { includeConfig 'conf/test_adapterremoval' } + } // Load igenomes.config if required From 0a6c4a0cba20dd4deafa24fcc5247894b2fb3d44 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 23 Mar 2022 14:58:39 +0100 Subject: [PATCH 5/6] Forgot a file suffix --- conf/test_adapterremoval.config | 4 ++-- nextflow.config | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config index a9f71433..0725cbc2 100644 --- a/conf/test_adapterremoval.config +++ b/conf/test_adapterremoval.config @@ -11,8 +11,8 @@ */ params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_name = 'Test profile for running with AdapterRemoval' + config_profile_description = 'Minimal test dataset to check pipeline function with AdapterRemoval data' // Limit resources so that this can run on GitHub Actions max_cpus = 2 diff --git a/nextflow.config b/nextflow.config index 2c409bef..eed41039 100644 --- a/nextflow.config +++ b/nextflow.config @@ -203,7 +203,7 @@ profiles { test_busco_auto { includeConfig 'conf/test_busco_auto.config' } test_full { includeConfig 'conf/test_full.config' } test_ancient_dna { includeConfig 'conf/test_ancient_dna.config' } - test_adapterremoval { includeConfig 'conf/test_adapterremoval' } + test_adapterremoval { includeConfig 'conf/test_adapterremoval.config' } } From ca0e4757eb3f11f19c6519a3f683154aafa9391c Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Thu, 24 Mar 2022 11:11:16 +0100 Subject: [PATCH 6/6] Apply suggestions from code review Co-authored-by: Daniel Straub <42973691+d4straub@users.noreply.github.com> --- CHANGELOG.md | 2 +- conf/modules.config | 2 +- docs/output.md | 2 -- workflows/mag.nf | 2 -- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d916cd96..2e8133f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#263](https://github.com/nf-core/mag/pull/263) - Restructure binning subworkflow in preparation for aDNA workflow and extended binning - [#247](https://github.com/nf-core/mag/pull/247) - Add ancient DNA subworkflow - [#263](https://github.com/nf-core/mag/pull/263) - Add MaxBin2 as second contig binning tool -- [#264](https://github.com/nf-core/mag/pull/265) - Add AdapterRemoval2 as an alternative read trimmer +- [#284](https://github.com/nf-core/mag/pull/285) - Add AdapterRemoval2 as an alternative read trimmer ### `Changed` diff --git a/conf/modules.config b/conf/modules.config index 5566ed6b..4119fb6e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,7 +35,7 @@ process { "--cut_front", "--cut_tail", "--cut_mean_quality ${params.fastp_cut_mean_quality}", - "--length_required ${params.reads_minlength} " + "--length_required ${params.reads_minlength}" ].join(' ').trim() publishDir = [ path: { "${params.outdir}/QC_shortreads/fastp/${meta.id}" }, diff --git a/docs/output.md b/docs/output.md index e29274e4..3e4b4de7 100644 --- a/docs/output.md +++ b/docs/output.md @@ -59,8 +59,6 @@ FastQC is run for visualising the general quality metrics of the sequencing runs ### AdapterRemoval2 -[fastp](https://github.com/OpenGene/fastp) is a all-in-one fastq preprocessor for read/adapter trimming and quality control. It is used in this pipeline for trimming adapter sequences and discard low-quality reads. 
Its output is in the results folder and part of the MultiQC report. - [AdapterRemoval](https://adapterremoval.readthedocs.io/en/stable/) searches for and removes remnant adapter sequences from High-Throughput Sequencing (HTS) data and (optionally) trims low quality bases from the 3' end of reads following adapter removal. It is popular in the field of palaeogenomics. The output logs are stored in the results folder, and as a part of the MultiQC report.
diff --git a/workflows/mag.nf b/workflows/mag.nf index c27c7384..2f05e923 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -252,11 +252,9 @@ workflow MAG { ch_adapterremoval_pe_out = Channel.empty() ch_adapterremoval_pe_out = ADAPTERREMOVAL_PE.out.pair1_truncated .join(ADAPTERREMOVAL_PE.out.pair2_truncated) - .dump(tag: "ar_pe_mix_out") .map { [ it[0], [it[1], it[2]] ] } - .dump(tag: "ar2_pe_out") ch_short_reads = Channel.empty() ch_short_reads = ch_short_reads.mix(ADAPTERREMOVAL_SE.out.singles_truncated, ch_adapterremoval_pe_out)