Skip to content

Commit

Permalink
Add module fgbio/collectduplexseqmetrics (#5960)
Browse files Browse the repository at this point in the history
* Initial commit

* Rerun nf-test

* Change assertion for duplex_qc output

* Add ggplot2 to version.yml

* Update snapshot

* Correct conda packages version
  • Loading branch information
georgiakes authored Jul 17, 2024
1 parent 6494138 commit af50683
Show file tree
Hide file tree
Showing 6 changed files with 355 additions and 0 deletions.
10 changes: 10 additions & 0 deletions modules/nf-core/fgbio/collectduplexseqmetrics/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment definition for the fgbio/collectduplexseqmetrics module.
name: "fgbio_collectduplexseqmetrics"
# Channels are listed in the order given here.
channels:
  - conda-forge
  - bioconda
  - defaults
# Exact version pins; both tools are also reported in the module's versions.yml.
dependencies:
  - "bioconda::fgbio=2.0.2"
  - "conda-forge::r-ggplot2=3.4.4"
80 changes: 80 additions & 0 deletions modules/nf-core/fgbio/collectduplexseqmetrics/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Runs `fgbio CollectDuplexSeqMetrics` on a UMI-grouped BAM and emits the
// resulting metrics tables plus the QC PDF.
//
// Inputs:
//   meta          - sample map; `meta.id` is used as the tag and default output prefix
//   grouped_bam   - BAM passed to `--input`
//   interval_list - optional; when non-empty it is passed via `--intervals`,
//                   pass `[]` to collect metrics without restricting regions
//
// Outputs: one channel per metrics file (see `output:`); `duplex_umi_counts`
// is optional and `versions.yml` records the fgbio and ggplot2 versions.
process FGBIO_COLLECTDUPLEXSEQMETRICS {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/mulled-v2-51891ad0b60843e4aade9cde2eb5d40c5ae92b80:72c944cdea5caff7f03b96034968ce2a4f1737bc-0':
        'biocontainers/mulled-v2-51891ad0b60843e4aade9cde2eb5d40c5ae92b80:72c944cdea5caff7f03b96034968ce2a4f1737bc-0' }"

    input:
    tuple val(meta), path(grouped_bam)
    path interval_list

    output:
    tuple val(meta), path("**.family_sizes.txt")        , emit: family_sizes
    tuple val(meta), path("**.duplex_family_sizes.txt") , emit: duplex_family_sizes
    tuple val(meta), path("**.duplex_yield_metrics.txt"), emit: duplex_yield_metrics
    tuple val(meta), path("**.umi_counts.txt")          , emit: umi_counts
    tuple val(meta), path("**.duplex_qc.pdf")           , emit: duplex_qc
    tuple val(meta), path("**.duplex_umi_counts.txt")   , emit: duplex_umi_counts, optional: true
    path "versions.yml"                                 , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Reference the declared `interval_list` input. The previous `${bed}` was an
    // undefined variable, so supplying an interval list failed the task.
    def intervals = interval_list ? "--intervals ${interval_list}" : ""
    // Default JVM heap in GB; lowered below when the task has less memory than this.
    def mem_gb = 8

    if (!task.memory) {
        log.info '[fgbio CollectDuplexSeqMetrics] Available memory not known - defaulting to 8GB. Specify process memory requirements to change this.'
    } else if (mem_gb > task.memory.giga) {
        // Leave 1 GB of headroom for the JVM, but never request less than 1 GB of heap.
        if (task.memory.giga < 2) {
            mem_gb = 1
        } else {
            mem_gb = task.memory.giga - 1
        }
    }

    """
    fgbio \\
        -Xmx${mem_gb}g \\
        --tmp-dir=. \\
        --async-io=true \\
        --compression=1 \\
        CollectDuplexSeqMetrics \\
        --input $grouped_bam \\
        --output ${prefix} \\
        $intervals \\
        $args
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//')
        ggplot2: \$(Rscript -e "library(ggplot2); cat(as.character(packageVersion('ggplot2')))")
    END_VERSIONS
    """

    stub:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Only create the optional duplex UMI counts file when the flag was requested.
    // Match whole option tokens: a plain substring test for "-u" would also fire
    // on unrelated options such as `--umi-tag`.
    def wants_duplex_umi = args.toString().tokenize().any { it ==~ /(--duplex-umi-counts|-u)(=.*)?/ }
    def touch_duplex_umi = wants_duplex_umi ? "touch ${prefix}.duplex_umi_counts.txt" : ""

    """
    touch ${prefix}.family_sizes.txt
    touch ${prefix}.duplex_family_sizes.txt
    touch ${prefix}.duplex_yield_metrics.txt
    touch ${prefix}.umi_counts.txt
    touch ${prefix}.duplex_qc.pdf
    $touch_duplex_umi
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//')
        ggplot2: \$(Rscript -e "library(ggplot2); cat(as.character(packageVersion('ggplot2')))")
    END_VERSIONS
    """
}
78 changes: 78 additions & 0 deletions modules/nf-core/fgbio/collectduplexseqmetrics/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
---
# Module metadata for fgbio/collectduplexseqmetrics: tools, inputs and outputs.
name: "fgbio_collectduplexseqmetrics"
description: Collects a suite of metrics to QC duplex sequencing data.
keywords:
  - UMIs
  - QC
  - bam
  - duplex
tools:
  - "fgbio":
      description: "A set of tools for working with genomic and high throughput sequencing data, including UMIs"
      homepage: "http://fulcrumgenomics.github.io/fgbio/"
      documentation: "http://fulcrumgenomics.github.io/fgbio/"
      tool_dev_url: "https://github.com/fulcrumgenomics/fgbio"
      licence: ["MIT"]
  - "r-ggplot2":
      description: "ggplot2 is a system for declaratively creating graphics, based on The Grammar of Graphics."
      homepage: "https://ggplot2.tidyverse.org/"
      documentation: "https://ggplot2.tidyverse.org/"
      tool_dev_url: "https://github.com/tidyverse/ggplot2"
      licence: ["MIT"]

input:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. `[ id:'sample1' ]`
  - grouped_bam:
      type: file
      description: >-
        Either (1) the exact BAM output by the GroupReadsByUmi tool (in the
        sort order it was produced in), or (2) a BAM file that has MI tags
        present on all reads (usually set by GroupReadsByUmi) and has been
        sorted with SortBam into TemplateCoordinate order.
      pattern: "*.bam"

  - interval_list:
      type: file
      description: >-
        Optional. Calculation of metrics may be restricted to a set of regions
        using the --intervals parameter. The file format is described here
        https://samtools.github.io/htsjdk/javadoc/htsjdk/index.html?htsjdk/samtools/util/Interval.html
      # Brace alternatives in a glob are comma-separated, not pipe-separated.
      pattern: "*.{tsv,txt,interval_list}"

# Output patterns mirror the file suffixes declared in the process output block.
output:
  - meta:
      type: map
      description: |
        Groovy Map containing sample information
        e.g. `[ id:'sample1' ]`
  - versions:
      type: file
      description: File containing software versions
      pattern: "versions.yml"
  - family_sizes:
      type: file
      description: Metrics on the frequency of different types of families of different sizes
      pattern: "*.family_sizes.txt"
  - duplex_family_sizes:
      type: file
      description: Metrics on the frequency of duplex tag families by the number of observations from each strand
      pattern: "*.duplex_family_sizes.txt"
  - duplex_yield_metrics:
      type: file
      description: Summary QC metrics produced using 5%, 10%, 15%...100% of the data
      pattern: "*.duplex_yield_metrics.txt"
  - umi_counts:
      type: file
      description: Metrics on the frequency of observations of UMIs within reads and tag families
      pattern: "*.umi_counts.txt"
  - duplex_qc:
      type: file
      description: A series of plots generated from the preceding metrics files for visualization
      pattern: "*.duplex_qc.pdf"
  - duplex_umi_counts:
      type: file
      description: >-
        Metrics on the frequency of observations of duplex UMIs within reads
        and tag families. Optional output, only produced when duplex UMI
        counting is requested (`--duplex-umi-counts`).
      pattern: "*.duplex_umi_counts.txt"

authors:
  - "@georgiakes"
maintainers:
  - "@georgiakes"
79 changes: 79 additions & 0 deletions modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// nf-test suite for FGBIO_COLLECTDUPLEXSEQMETRICS: one real run and one stub run,
// both on the same duplex-UMI grouped BAM and both without an interval list.
nextflow_process {

name "Test Process FGBIO_COLLECTDUPLEXSEQMETRICS"
script "../main.nf"
process "FGBIO_COLLECTDUPLEXSEQMETRICS"

tag "modules"
tag "modules_nfcore"
tag "fgbio"
tag "fgbio/collectduplexseqmetrics"


// Real execution. input[1] is an empty list, so no interval list is supplied
// and the `--intervals` code path of the process is not exercised here.
test("homo_sapiens - bam") {

when {
process {
"""
input[0] = [
[ id:'test', single_end:false ], // meta map
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_grouped.bam', checkIfExists: true)
]
input[1]=[]
"""
}
}

// Snapshot arguments are positional; their order must stay in sync with the
// stored snapshot file. Only the file name of the QC PDF is snapshotted,
// not its content — presumably because the PDF bytes are not reproducible.
then {
assertAll(
{ assert process.success },
{ assert snapshot(process.out.family_sizes,
process.out.duplex_family_sizes,
process.out.duplex_yield_metrics,
process.out.umi_counts,
process.out.duplex_umi_counts,
process.out.versions,
file(process.out.duplex_qc[0][1]).name)
.match() }

)
}

}

// Stub execution (`-stub`): same inputs and same snapshot layout as the real run.
test("homo_sapiens - stub") {

options "-stub"

when {
process {
"""
input[0] = [
[ id:'test', single_end:false ], // meta map
file(params.modules_testdata_base_path + '/genomics/homo_sapiens/illumina/bam/umi/test.paired_end.duplex_umi_grouped.bam', checkIfExists: true)
]
input[1] = []
"""
}
}

// Same positional snapshot as the real run; the PDF is again matched by name only.
then {
assertAll(
{ assert process.success },
{ assert snapshot(process.out.family_sizes,
process.out.duplex_family_sizes,
process.out.duplex_yield_metrics,
process.out.umi_counts,
process.out.duplex_umi_counts,
process.out.versions,
file(process.out.duplex_qc[0][1]).name)
.match() }
)
}

}

}
106 changes: 106 additions & 0 deletions modules/nf-core/fgbio/collectduplexseqmetrics/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
{
"homo_sapiens - stub": {
"content": [
[
[
{
"id": "test",
"single_end": false
},
"test.family_sizes.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
[
[
{
"id": "test",
"single_end": false
},
"test.duplex_family_sizes.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
[
[
{
"id": "test",
"single_end": false
},
"test.duplex_yield_metrics.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
[
[
{
"id": "test",
"single_end": false
},
"test.umi_counts.txt:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
[

],
[
"versions.yml:md5,637a7384cd910f0e0541a631c52b95e1"
],
"test.duplex_qc.pdf"
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "24.04.3"
},
"timestamp": "2024-07-17T19:26:23.325859809"
},
"homo_sapiens - bam": {
"content": [
[
[
{
"id": "test",
"single_end": false
},
"test.family_sizes.txt:md5,a49de49bd587440c316fec830f502620"
]
],
[
[
{
"id": "test",
"single_end": false
},
"test.duplex_family_sizes.txt:md5,129e41170b9f5f2f8edce62a686c8548"
]
],
[
[
{
"id": "test",
"single_end": false
},
"test.duplex_yield_metrics.txt:md5,237e4e4ee713fdf672b0ee796827fb9d"
]
],
[
[
{
"id": "test",
"single_end": false
},
"test.umi_counts.txt:md5,9fe38b2a49ca80492b3a1c6a55679155"
]
],
[

],
[
"versions.yml:md5,637a7384cd910f0e0541a631c52b95e1"
],
"test.duplex_qc.pdf"
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "24.04.3"
},
"timestamp": "2024-07-17T19:26:03.1373243"
}
}
2 changes: 2 additions & 0 deletions modules/nf-core/fgbio/collectduplexseqmetrics/tests/tags.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Maps the module's test tag to the paths that belong to it.
fgbio/collectduplexseqmetrics:
  - "modules/nf-core/fgbio/collectduplexseqmetrics/**"

0 comments on commit af50683

Please sign in to comment.