From 12beb8c5d5372f4af3f7797b51d3307ab70534c3 Mon Sep 17 00:00:00 2001 From: stavgrossfeld Date: Thu, 28 Dec 2023 10:56:19 -0800 Subject: [PATCH 1/3] sambamba --- modules.json | 5 +++ .../nf-core/sambamba/markdup/environment.yml | 7 +++ modules/nf-core/sambamba/markdup/main.nf | 37 ++++++++++++++++ modules/nf-core/sambamba/markdup/meta.yml | 41 ++++++++++++++++++ .../local/bam_markduplicates_sambamba/main.nf | 43 +++++++++++++++++++ 5 files changed, 133 insertions(+) create mode 100644 modules/nf-core/sambamba/markdup/environment.yml create mode 100644 modules/nf-core/sambamba/markdup/main.nf create mode 100644 modules/nf-core/sambamba/markdup/meta.yml create mode 100644 subworkflows/local/bam_markduplicates_sambamba/main.nf diff --git a/modules.json b/modules.json index dd5387fbb8..dde473b4d3 100644 --- a/modules.json +++ b/modules.json @@ -339,6 +339,11 @@ "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["bam_ngscheckmate"] }, + "sambamba/markdup": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "samblaster": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", diff --git a/modules/nf-core/sambamba/markdup/environment.yml b/modules/nf-core/sambamba/markdup/environment.yml new file mode 100644 index 0000000000..6bb20af7d9 --- /dev/null +++ b/modules/nf-core/sambamba/markdup/environment.yml @@ -0,0 +1,7 @@ +name: sambamba_markdup +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sambamba=1.0 diff --git a/modules/nf-core/sambamba/markdup/main.nf b/modules/nf-core/sambamba/markdup/main.nf new file mode 100644 index 0000000000..8a6e35ba2f --- /dev/null +++ b/modules/nf-core/sambamba/markdup/main.nf @@ -0,0 +1,37 @@ +process SAMBAMBA_MARKDUP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity//sambamba:1.0--h98b6b92_0': + 'biocontainers/sambamba:1.0--h98b6b92_0' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + sambamba \\ + markdup \\ + $args \\ + -t $task.cpus \\ + $bam \\ + ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sambamba: \$(echo \$(sambamba --version 2>&1) | awk '{print \$2}' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/sambamba/markdup/meta.yml b/modules/nf-core/sambamba/markdup/meta.yml new file mode 100644 index 0000000000..bf7d210770 --- /dev/null +++ b/modules/nf-core/sambamba/markdup/meta.yml @@ -0,0 +1,41 @@ +name: "sambamba_markdup" +description: find and mark duplicate reads in BAM file +keywords: + - markduplicates + - duplicates + - bam +tools: + - "sambamba": + description: "process your BAM data faster!" + homepage: "https://lomereiter.github.io/sambamba/docs/sambamba-view.html" + documentation: "https://lomereiter.github.io/sambamba/docs/sambamba-view.html" + tool_dev_url: "https://github.com/biod/sambamba" + licence: "['GPL v2']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +authors: + - "@BioInf2305" +maintainers: + - "@BioInf2305" diff --git a/subworkflows/local/bam_markduplicates_sambamba/main.nf b/subworkflows/local/bam_markduplicates_sambamba/main.nf new file mode 100644 index 0000000000..7dbea4bbac --- /dev/null +++ b/subworkflows/local/bam_markduplicates_sambamba/main.nf @@ -0,0 +1,43 @@ +// +// MARKDUPLICATES AND QC after mapping +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { CRAM_QC_MOSDEPTH_SAMTOOLS } from '../cram_qc_mosdepth_samtools/main' +include { SAMBAMBA_MARKDUPLICATES } from '../../../modules/nf-core/sambamba/markdup/main' + +workflow BAM_MARKDUPLICATES { + take: + bam // channel: [mandatory] [ meta, bam ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals_bed_combined // channel: [optional] [ intervals_bed ] + + main: + versions = Channel.empty() + reports = Channel.empty() + + // RUN MARKUPDUPLICATES + SAMBAMBA_MARKDUPLICATES(bam, fasta, fasta_fai) + + // Join with the crai file + cram = SAMBAMBA_MARKDUPLICATES.out.cram.join(SAMBAMBA_MARKDUPLICATES.out.crai, failOnDuplicate: true, failOnMismatch: true) + + // QC on CRAM + CRAM_QC_MOSDEPTH_SAMTOOLS(cram, fasta, intervals_bed_combined) + + // Gather all reports generated + reports = reports.mix(SAMBAMBA_MARKDUPLICATES.out.metrics) + reports = reports.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.reports) + + // Gather versions of all tools used + versions = versions.mix(SAMBAMBA_MARKDUPLICATES.out.versions) + versions = versions.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.versions) + + emit: + cram + reports + + versions // channel: [ versions.yml ] +} From bbe131c7f57a13979ccfc0b65f8fea656ed0e642 Mon Sep 17 
00:00:00 2001 From: stavgrossfeld Date: Thu, 28 Dec 2023 11:21:54 -0800 Subject: [PATCH 2/3] add subworkflow and logic for calling from sarek.nf --- .../local/bam_markduplicates_sambamba/main.nf | 8 ++++---- workflows/sarek.nf | 20 ++++++++++++++++++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/bam_markduplicates_sambamba/main.nf b/subworkflows/local/bam_markduplicates_sambamba/main.nf index 7dbea4bbac..accfd11ebb 100644 --- a/subworkflows/local/bam_markduplicates_sambamba/main.nf +++ b/subworkflows/local/bam_markduplicates_sambamba/main.nf @@ -5,9 +5,9 @@ // A when clause condition is defined in the conf/modules.config to determine if the module should be run include { CRAM_QC_MOSDEPTH_SAMTOOLS } from '../cram_qc_mosdepth_samtools/main' -include { SAMBAMBA_MARKDUPLICATES } from '../../../modules/nf-core/sambamba/markdup/main' +include { SAMBAMBA_MARKDUP } from '../../../modules/nf-core/sambamba/markdup/main' -workflow BAM_MARKDUPLICATES { +workflow BAM_MARKDUPLICATES_SAMBAMBA { take: bam // channel: [mandatory] [ meta, bam ] fasta // channel: [mandatory] [ fasta ] @@ -19,10 +19,10 @@ workflow BAM_MARKDUPLICATES { reports = Channel.empty() // RUN MARKUPDUPLICATES - SAMBAMBA_MARKDUPLICATES(bam, fasta, fasta_fai) + SAMBAMBA_MARKDUP(bam) // Join with the crai file - cram = SAMBAMBA_MARKDUPLICATES.out.cram.join(SAMBAMBA_MARKDUPLICATES.out.crai, failOnDuplicate: true, failOnMismatch: true) + cram = SAMBAMBA_MARKDUP.out.cram.join(SAMBAMBA_MARKDUPLICATES.out.crai, failOnDuplicate: true, failOnMismatch: true) // QC on CRAM CRAM_QC_MOSDEPTH_SAMTOOLS(cram, fasta, intervals_bed_combined) diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 99ecd733fc..b6287e4102 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -185,6 +185,7 @@ include { SAMTOOLS_CONVERT as CRAM_TO_BAM_RECAL } from '../modules/nf-core // Mark Duplicates (+QC) include { BAM_MARKDUPLICATES } from '../subworkflows/local/bam_markduplicates/main' +include { 
BAM_MARKDUPLICATES_SAMBAMBA } from '../subworkflows/local/bam_markduplicates_sambamba/main' include { BAM_MARKDUPLICATES_SPARK } from '../subworkflows/local/bam_markduplicates_spark/main' include { BAM_SENTIEON_DEDUP } from '../subworkflows/local/bam_sentieon_dedup/main' @@ -611,7 +612,24 @@ workflow SAREK { // Gather used softwares versions versions = versions.mix(BAM_SENTIEON_DEDUP.out.versions) - } else { + } else if (params.sambamba_dedup) { + BAM_MARKDUPLICATES_SAMBAMBA( + cram_for_markduplicates, + fasta, + fasta_fai, + intervals_for_preprocessing) + + cram_markduplicates_no_spark = BAM_MARKDUPLICATES.out.cram + + // Gather QC reports + reports = reports.mix(BAM_MARKDUPLICATES.out.reports.collect{ meta, report -> report }) + + // Gather used softwares versions + versions = versions.mix(BAM_MARKDUPLICATES.out.versions) + } + + + else { BAM_MARKDUPLICATES( cram_for_markduplicates, fasta, From 35ba757e574cbd6f5054c54627425a6887c5e252 Mon Sep 17 00:00:00 2001 From: stavgrossfeld Date: Sat, 30 Dec 2023 01:40:03 +0000 Subject: [PATCH 3/3] debug --- modules/nf-core/sambamba/markdup/main.nf | 1 + .../local/bam_markduplicates_sambamba/main.nf | 17 +++++++++++------ workflows/sarek.nf | 10 +++++----- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/modules/nf-core/sambamba/markdup/main.nf b/modules/nf-core/sambamba/markdup/main.nf index 8a6e35ba2f..63cbb02623 100644 --- a/modules/nf-core/sambamba/markdup/main.nf +++ b/modules/nf-core/sambamba/markdup/main.nf @@ -7,6 +7,7 @@ process SAMBAMBA_MARKDUP { 'https://depot.galaxyproject.org/singularity//sambamba:1.0--h98b6b92_0': 'biocontainers/sambamba:1.0--h98b6b92_0' }" + // Input item shape: [ meta, bam ], e.g. [[patient:test, sample:test, sex:XX, status:0, id:test, data_type:bam], test.bam] input: tuple val(meta), path(bam) diff --git a/subworkflows/local/bam_markduplicates_sambamba/main.nf b/subworkflows/local/bam_markduplicates_sambamba/main.nf index accfd11ebb..0ce66ab6c6 100644 
--- a/subworkflows/local/bam_markduplicates_sambamba/main.nf +++ b/subworkflows/local/bam_markduplicates_sambamba/main.nf @@ -19,21 +19,26 @@ workflow BAM_MARKDUPLICATES_SAMBAMBA { reports = Channel.empty() // RUN MARKUPDUPLICATES + // Mark duplicates in each [ meta, bam ] input item with sambamba SAMBAMBA_MARKDUP(bam) + // Forward the duplicate-marked BAM rather than the untouched input channel + // NOTE: emitted under the name 'cram' for the caller, but items are [ meta, bam ] with no index + + cram = SAMBAMBA_MARKDUP.out.bam // Join with the crai file - cram = SAMBAMBA_MARKDUP.out.cram.join(SAMBAMBA_MARKDUPLICATES.out.crai, failOnDuplicate: true, failOnMismatch: true) + // bam = SAMBAMBA_MARKDUP.out.bam.join(SAMBAMBA_MARKDUPLICATES.out.bai, failOnDuplicate: true, failOnMismatch: true) // QC on CRAM - CRAM_QC_MOSDEPTH_SAMTOOLS(cram, fasta, intervals_bed_combined) + // CRAM_QC_MOSDEPTH_SAMTOOLS(bam, fasta, intervals_bed_combined) // Gather all reports generated - reports = reports.mix(SAMBAMBA_MARKDUPLICATES.out.metrics) - reports = reports.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.reports) + // reports = reports.mix(SAMBAMBA_MARKDUP.out.metrics) + // reports = reports.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.reports) // Gather versions of all tools used - versions = versions.mix(SAMBAMBA_MARKDUPLICATES.out.versions) - versions = versions.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.versions) + versions = versions.mix(SAMBAMBA_MARKDUP.out.versions) + // versions = versions.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.versions) emit: cram diff --git a/workflows/sarek.nf b/workflows/sarek.nf index b6287e4102..c55a3f0de0 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -619,16 +619,16 @@ workflow SAREK { fasta_fai, intervals_for_preprocessing) - cram_markduplicates_no_spark = BAM_MARKDUPLICATES.out.cram + cram_markduplicates_no_spark = BAM_MARKDUPLICATES_SAMBAMBA.out.cram // Gather QC reports - reports = reports.mix(BAM_MARKDUPLICATES.out.reports.collect{ meta, report -> report }) + reports = reports.mix(BAM_MARKDUPLICATES_SAMBAMBA.out.reports.collect{ meta, report -> report }) // Gather used softwares versions - versions = versions.mix(BAM_MARKDUPLICATES.out.versions) + versions = 
versions.mix(BAM_MARKDUPLICATES_SAMBAMBA.out.versions) } - - + + else { BAM_MARKDUPLICATES( cram_for_markduplicates,