diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml index b0d7b8e..233897e 100644 --- a/.github/workflows/nf-test.yml +++ b/.github/workflows/nf-test.yml @@ -5,21 +5,13 @@ on: [push, pull_request] jobs: test: runs-on: ubuntu-latest - strategy: - matrix: - shard: [1, 2, 3, 4] + steps: - name: Checkout uses: actions/checkout@v4 - - name: Set up JDK 11 - uses: actions/setup-java@v2 - with: - java-version: "11" - distribution: "adopt" - - name: Setup Nextflow latest-edge - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 with: version: "latest-edge" @@ -28,5 +20,5 @@ jobs: wget -qO- https://get.nf-test.com | bash sudo mv nf-test /usr/local/bin/ - - name: Run Tests (Shard ${{ matrix.shard }}/${{ strategy.job-total }}) - run: nf-test test --ci --shard ${{ matrix.shard }}/${{ strategy.job-total }} . + - name: Run Tests + run: nf-test test --ci tests diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ebf58f..a4b5e28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ Initial release of nf-core/seqinspector, created with the [nf-core](https://nf-c - [#50](https://github.com/nf-core/seqinspector/pull/50) Add an optional subsampling step. - [#51](https://github.com/nf-core/seqinspector/pull/51) Add nf-test to CI. - [#63](https://github.com/nf-core/seqinspector/pull/63) Contribution guidelines added about displaying results for new tools +- [#53](https://github.com/nf-core/seqinspector/pull/53) Add FastQ-Screen database multiplexing and limit scope of nf-test in CI. ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index 33c75be..208cfa1 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,6 +18,10 @@ > Telatin A, Fariselli P, Birolo G. SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files. Bioengineering 2021, 8, 59. doi.org/10.3390/bioengineering8050059 +- [FastQ Screen](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/) + +> Wingett SW and Andrews S. 
FastQ Screen: A tool for multi-genome mapping and quality control [version 2; referees: 4 approved]. F1000Research 2018, 7:1338 (https://doi.org/10.12688/f1000research.15931.2) + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. diff --git a/assets/example_fastq_screen_references.csv b/assets/example_fastq_screen_references.csv new file mode 100644 index 0000000..59f0cdf --- /dev/null +++ b/assets/example_fastq_screen_references.csv @@ -0,0 +1,4 @@ +name,dir,basename,aligner +Ecoli,s3://ngi-igenomes/igenomes/Escherichia_coli_K_12_MG1655/NCBI/2001-10-15/Sequence/Bowtie2Index/,genome,bowtie2 +PhiX,s3://ngi-igenomes/igenomes/PhiX/Illumina/RTA/Sequence/Bowtie2Index/,genome,bowtie2 +Scerevisiae,s3://ngi-igenomes/igenomes/Saccharomyces_cerevisiae/NCBI/build3.1/Sequence/Bowtie2Index/,genome,bowtie2 diff --git a/assets/schema_fastq_screen_references.json b/assets/schema_fastq_screen_references.json new file mode 100644 index 0000000..9a938d9 --- /dev/null +++ b/assets/schema_fastq_screen_references.json @@ -0,0 +1,35 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/seqinspector/master/assets/schema_fastq_screen_references.json", + "title": "nf-core/seqinspector pipeline - params.fastq_screen_references schema", + "description": "Schema for the file provided with params.fastq_screen_references", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "The reference name as referred to by FastQ Screen." 
+ }, + "dir": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+$", + "errorMessage": "Path to the dir containing the aligner reference and index. Can be remote." + }, + "basename": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "The shared basename of the reference and index files contained in the dir." + }, + "aligner": { + "type": "string", + "enum": ["bowtie", "bowtie2", "bwa", "minimap2"], + "errorMessage": "Specify the aligner to use for the mapping. Valid arguments are 'bowtie', 'bowtie2' (default), 'bwa' or 'minimap2'." + } + }, + "required": ["name", "dir", "basename", "aligner"] + } +} diff --git a/conf/test.config b/conf/test.config index 76a7ad0..0ebc405 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,6 +26,7 @@ params { // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed input = params.pipelines_testdata_base_path + 'seqinspector/testdata/NovaSeq6000/samplesheet.csv' + fastq_screen_references = "${projectDir}/assets/example_fastq_screen_references.csv" // Genome references genome = 'R64-1-1' diff --git a/docs/output.md b/docs/output.md index 43a1438..dc26159 100644 --- a/docs/output.md +++ b/docs/output.md @@ -13,6 +13,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Seqtk](#seqtk) - Subsample a specific number of reads per sample - [FastQC](#fastqc) - Raw read QC - [SeqFu Stats](#seqfu_stats) - Statistics for FASTA or FASTQ files +- [Fastqscreen](#fastqscreen) - Mapping against a set of references for basic contamination QC - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution @@ -41,7 +42,32 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d 
[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). -#### SeqFu Stats +### FASTQSCREEN + +
+Output files + +- `fastqscreen/` + - `*_screen.html`: Interactive graphical fastqscreen report which summarises the mapping of your sequences against each of your libraries. + - `*_screen.png`: Static graphical fastqscreen report which summarises the mapping of your sequences against each of your libraries. + - `*_screen.txt`: Text-based fastqscreen report which summarises the mapping of your sequences against each of your libraries. + +
+ +[Fastqscreen](https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/) allows you to set up a standard set of libraries against which all of your sequences can be searched. Your search libraries might contain the genomes of all of the organisms you work on, along with PhiX, Vectors or other contaminants commonly seen in sequencing experiments. + +It requires a `.csv` detailing: + +- the working name of the reference +- the name of the aligner used to generate its index (which is also the aligner and index used by the tool) +- the file basename of the reference and its index (e.g. the reference `genome.fa` and its index `genome.bt2` have the basename `genome`) +- the path to a dir where the reference and index files both reside. + +See `assets/example_fastq_screen_references.csv` for an example. + +The `.csv` is provided as a pipeline parameter `fastq_screen_references`. The `.csv` is used to construct a `FastQ Screen` configuration file within the context of the process work directory in order to properly mount the references. + +### SeqFu Stats
Output files diff --git a/modules.json b/modules.json index 102b391..566d84e 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,12 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, + "fastqscreen/fastqscreen": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"], + "patch": "modules/nf-core/fastqscreen/fastqscreen/fastqscreen-fastqscreen.diff" + }, "multiqc": { "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", diff --git a/modules/nf-core/fastqscreen/fastqscreen/environment.yml b/modules/nf-core/fastqscreen/fastqscreen/environment.yml new file mode 100644 index 0000000..7d4c892 --- /dev/null +++ b/modules/nf-core/fastqscreen/fastqscreen/environment.yml @@ -0,0 +1,14 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fastq-screen=0.16.0 + - bioconda::perl-gdgraph=1.54 + # - gzip + # - SAMtools + # - bowtie + # - bowtie2 + # - bwa + # - bismark diff --git a/modules/nf-core/fastqscreen/fastqscreen/fastqscreen-fastqscreen.diff b/modules/nf-core/fastqscreen/fastqscreen/fastqscreen-fastqscreen.diff new file mode 100644 index 0000000..2fbd94b --- /dev/null +++ b/modules/nf-core/fastqscreen/fastqscreen/fastqscreen-fastqscreen.diff @@ -0,0 +1,97 @@ +Changes in module 'nf-core/fastqscreen/fastqscreen' +Changes in 'fastqscreen/fastqscreen/environment.yml': +--- modules/nf-core/fastqscreen/fastqscreen/environment.yml ++++ modules/nf-core/fastqscreen/fastqscreen/environment.yml +@@ -4,5 +4,11 @@ + - conda-forge + - bioconda + dependencies: +- - "bioconda::fastq-screen=0.15.3" ++ - bioconda::fastq-screen=0.16.0 + - bioconda::perl-gdgraph=1.54 ++ # - gzip ++ # - SAMtools ++ # - bowtie ++ # - bowtie2 ++ # - bwa ++ # - bismark + +Changes in 'fastqscreen/fastqscreen/main.nf': +--- 
modules/nf-core/fastqscreen/fastqscreen/main.nf ++++ modules/nf-core/fastqscreen/fastqscreen/main.nf +@@ -4,12 +4,12 @@ + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? +- 'https://depot.galaxyproject.org/singularity/fastq-screen:0.15.3--pl5321hdfd78af_0': +- 'biocontainers/fastq-screen:0.15.3--pl5321hdfd78af_0'}" ++ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fc/fc53eee7ca23c32220a9662fbb63c67769756544b6d74a1ee85cf439ea79a7ee/data' : ++ 'community.wave.seqera.io/library/fastq-screen_perl-gdgraph:5c1786a5d5bc1309'}" + + input: +- tuple val(meta), path(reads) // .fastq files +- path database ++ tuple val(meta), path(reads, arity: '1..2') ++ tuple val(ref_names), path(ref_dirs, name:"ref*"), val(ref_basenames), val(ref_aligners) + + output: + tuple val(meta), path("*.txt") , emit: txt +@@ -24,31 +24,35 @@ + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" ++ def config_content = ref_names.withIndex().collect { name, i -> "DATABASE ${name} ./${ref_dirs[i]}/${ref_basenames[i]} ${ref_aligners[i]}" }.join('\n') ++ """ ++ echo '${config_content}' > fastq_screen.conf + +- """ +- fastq_screen --threads ${task.cpus} \\ +- --aligner bowtie2 \\ +- --conf ${database}/fastq_screen.conf \\ ++ fastq_screen \\ ++ --conf fastq_screen.conf \\ ++ --threads ${task.cpus} \\ + $reads \\ +- $args \\ ++ $args + +- cat <<-END_VERSIONS > versions.yml +- "${task.process}": +- fastqscreen: \$(echo \$(fastq_screen --version 2>&1) | sed 's/^.*FastQ Screen v//; s/ .*\$//') +- END_VERSIONS ++ mv *_screen.txt ${prefix}_screen.txt ++ mv *_screen.html ${prefix}_screen.html ++ mv *_screen.png ${prefix}_screen.png ++ ++ fastq_screen_version=\$(fastq_screen --version 2>&1 | sed 's/^.*FastQ Screen v//; s/ .*\$//') ++ echo "\\\"${task.process}\\\":" > versions.yml ++ echo " fastqscreen: \$fastq_screen_version" >> versions.yml + """ + + stub: + 
def prefix = task.ext.prefix ?: "${meta.id}" + """ +- touch test_1_screen.html +- touch test_1_screen.png +- touch test_1_screen.txt ++ touch ${prefix}_screen.html ++ touch ${prefix}_screen.png ++ touch ${prefix}_screen.txt + +- cat <<-END_VERSIONS > versions.yml +- "${task.process}": +- fastqscreen: \$(echo \$(fastq_screen --version 2>&1) | sed 's/^.*FastQ Screen v//; s/ .*\$//') +- END_VERSIONS ++ fastq_screen_version=\$(fastq_screen --version 2>&1 | sed 's/^.*FastQ Screen v//; s/ .*\$//') ++ echo "\\\"${task.process}\\\":" > versions.yml ++ echo " fastqscreen: \$fastq_screen_version" >> versions.yml + """ + + } + +'modules/nf-core/fastqscreen/fastqscreen/meta.yml' is unchanged +'modules/nf-core/fastqscreen/fastqscreen/tests/main.nf.test' is unchanged +'modules/nf-core/fastqscreen/fastqscreen/tests/main.nf.test.snap' is unchanged +'modules/nf-core/fastqscreen/fastqscreen/tests/nextflow.config' is unchanged +'modules/nf-core/fastqscreen/fastqscreen/tests/tags.yml' is unchanged +************************************************************ diff --git a/modules/nf-core/fastqscreen/fastqscreen/main.nf b/modules/nf-core/fastqscreen/fastqscreen/main.nf new file mode 100644 index 0000000..2eba839 --- /dev/null +++ b/modules/nf-core/fastqscreen/fastqscreen/main.nf @@ -0,0 +1,58 @@ +process FASTQSCREEN_FASTQSCREEN { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fc/fc53eee7ca23c32220a9662fbb63c67769756544b6d74a1ee85cf439ea79a7ee/data' : + 'community.wave.seqera.io/library/fastq-screen_perl-gdgraph:5c1786a5d5bc1309'}" + + input: + tuple val(meta), path(reads, arity: '1..2') + tuple val(ref_names), path(ref_dirs, name:"ref*"), val(ref_basenames), val(ref_aligners) + + output: + tuple val(meta), path("*.txt") , emit: txt + tuple val(meta), path("*.png") , emit: png , optional: true + tuple val(meta), path("*.html") , emit: html + tuple val(meta), path("*.fastq.gz"), emit: fastq, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: "" + def config_content = ref_names.withIndex().collect { name, i -> "DATABASE ${name} ./${ref_dirs[i]}/${ref_basenames[i]} ${ref_aligners[i]}" }.join('\n') + """ + echo '${config_content}' > fastq_screen.conf + + fastq_screen \\ + --conf fastq_screen.conf \\ + --threads ${task.cpus} \\ + $reads \\ + $args + + mv *_screen.txt ${prefix}_screen.txt + mv *_screen.html ${prefix}_screen.html + mv *_screen.png ${prefix}_screen.png + + fastq_screen_version=\$(fastq_screen --version 2>&1 | sed 's/^.*FastQ Screen v//; s/ .*\$//') + echo "\\\"${task.process}\\\":" > versions.yml + echo " fastqscreen: \$fastq_screen_version" >> versions.yml + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_screen.html + touch ${prefix}_screen.png + touch ${prefix}_screen.txt + + fastq_screen_version=\$(fastq_screen --version 2>&1 | sed 's/^.*FastQ Screen v//; s/ .*\$//') + echo "\\\"${task.process}\\\":" > versions.yml + echo " fastqscreen: \$fastq_screen_version" >> versions.yml + """ + +} diff --git a/modules/nf-core/fastqscreen/fastqscreen/meta.yml b/modules/nf-core/fastqscreen/fastqscreen/meta.yml new file mode 100644 index 0000000..39c86b4 --- /dev/null +++ 
b/modules/nf-core/fastqscreen/fastqscreen/meta.yml @@ -0,0 +1,78 @@ +name: fastqscreen_fastqscreen +description: Align reads to multiple reference genomes using fastq-screen +keywords: + - align + - map + - fasta + - fastq + - genome + - reference +tools: + - "fastqscreen": + description: "FastQ Screen allows you to screen a library of sequences in FastQ + format against a set of sequence databases so you can see if the composition + of the library matches with what you expect." + homepage: "https://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/" + documentation: "https://stevenwingett.github.io/FastQ-Screen/" + tool_dev_url: "https://github.com/StevenWingett/FastQ-Screen/archive/refs/tags/v0.15.3.zip" + doi: "10.5281/zenodo.5838377" + licence: ["GPL-3.0-or-later"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - - database: + type: directory + description: fastq screen database folder containing config file and index folders + pattern: "FastQ_Screen_Genomes" +output: + - txt: + - meta: + type: map + description: Groovy Map containing sample information + - "*.txt": + type: file + description: TXT file containing alignment statistics + pattern: "*.txt" + - png: + - meta: + type: map + description: Groovy Map containing sample information + - "*.png": + type: file + description: PNG file with graphical representation of alignments + pattern: "*.png" + - html: + - meta: + type: map + description: Groovy Map containing sample information + - "*.html": + type: file + description: HTML file containing mapping results as a table and graphical representation + pattern: "*.html" + - fastq: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.fastq.gz": + type: file + description: FastQ file containing reads that did not align to any database (optional) + pattern: "*.fastq.gz" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@snesic" + - "@JPejovicApis" diff --git a/modules/nf-core/fastqscreen/fastqscreen/tests/main.nf.test b/modules/nf-core/fastqscreen/fastqscreen/tests/main.nf.test new file mode 100644 index 0000000..71230a2 --- /dev/null +++ b/modules/nf-core/fastqscreen/fastqscreen/tests/main.nf.test @@ -0,0 +1,117 @@ +nextflow_process { + + name "Test Process FASTQSCREEN_FASTQSCREEN" + script "../main.nf" + process "FASTQSCREEN_FASTQSCREEN" + + tag "modules" + tag "modules_nfcore" + tag "bowtie2/build" + tag "fastqscreen" + tag "fastqscreen/buildfromindex" + tag "fastqscreen/fastqscreen" + tag "buildfromindex" + tag "modules_fastqscreen" + + setup { + + run("BOWTIE2_BUILD") { + script "../../../bowtie2/build/main.nf" + process { + """ + input[0] = Channel.from([ + [[id: "sarscov2"], file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)], + [[id: "human"] , file(params.test_data['homo_sapiens']['genome']['genome_21_fasta'], checkIfExists: true)] + ]) + """ + } + } + + run("FASTQSCREEN_BUILDFROMINDEX") { + script "../../../fastqscreen/buildfromindex/main.nf" + process { + """ + input[0] = BOWTIE2_BUILD.out.index.map{meta, index -> meta.id}.collect() + input[1] = BOWTIE2_BUILD.out.index.map{meta, index -> index}.collect() + """ + } + } + } + + test("sarscov2 - human") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:true ], + [file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = FASTQSCREEN_BUILDFROMINDEX.out.database + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.version).match() }, + { assert 
file(process.out.txt.get(0).get(1)).exists() }, + { assert file(process.out.png.get(0).get(1)).exists() }, + { assert file(process.out.html.get(0).get(1)).exists() } + ) + } + + } + + test("sarscov2 - human - tags") { + config './nextflow.config' + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], + [file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)] + ] + input[1] = FASTQSCREEN_BUILDFROMINDEX.out.database + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.version, + process.out.txt, + process.out.fastq, + path(process.out.html.get(0).get(1)).readLines()[0..10], + path(process.out.png.get(0).get(1)).exists() + ).match() } + ) + } + + } + + test("sarscov2 - human - stub") { + + options "-stub" + when { + process { + """ + input[0] = [[ id:'test', single_end:true ], + [file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = FASTQSCREEN_BUILDFROMINDEX.out.database + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/fastqscreen/fastqscreen/tests/main.nf.test.snap b/modules/nf-core/fastqscreen/fastqscreen/tests/main.nf.test.snap new file mode 100644 index 0000000..2afffde --- /dev/null +++ b/modules/nf-core/fastqscreen/fastqscreen/tests/main.nf.test.snap @@ -0,0 +1,132 @@ +{ + "sarscov2 - human": { + "content": null, + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-08-31T05:42:29.972454812" + }, + "sarscov2 - human - tags": { + "content": [ + null, + [ + [ + { + "id": "test", + "single_end": false + }, + "test_1_screen.txt:md5,b0b0ea58bc26ebaa4d573a85e7898f25" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.tagged.fastq.gz:md5,f742b162c43ce28f80b89608d5c47f3d", + "test_1.tagged_filter.fastq.gz:md5,28527a76bb0bb3fce0ee76afe01e90aa" + ] + ] + ], 
+ [ + "", + "", + "", + "", + "", + "", + "\t", + "\tFastQ Screen Processing Report - test_1.fastq.gz", + "\t