diff --git a/subworkflows/nf-core/bam_subsampledepth_samtools/main.nf b/subworkflows/nf-core/bam_subsampledepth_samtools/main.nf
new file mode 100644
index 000000000000..b7f54e630468
--- /dev/null
+++ b/subworkflows/nf-core/bam_subsampledepth_samtools/main.nf
@@ -0,0 +1,70 @@
+include { SAMTOOLS_DEPTH } from '../../../modules/nf-core/samtools/depth'
+include { GAWK           } from '../../../modules/nf-core/gawk'
+include { SAMTOOLS_VIEW  } from '../../../modules/nf-core/samtools/view'
+
+//
+// Subsample alignment files to a requested mean depth with samtools.
+//
+// For each [ meta, bam, bai, depth ] item the mean depth of the file is
+// measured (SAMTOOLS_DEPTH followed by GAWK) and the fraction depth / mean is
+// stored in meta.subsample_fraction. SAMTOOLS_VIEW must be configured by the
+// caller to consume it, e.g. ext.args = { "--subsample ${meta.subsample_fraction}" },
+// and GAWK must be configured (ext.args2) to print the mean of the depth column.
+//
+workflow BAM_SUBSAMPLEDEPTH_SAMTOOLS {
+
+    take:
+    ch_bam_bai_depth // channel: [ val(meta), path(bam), path(bai), val(depth) ]
+    ch_fasta         // channel: [ val(meta), path(fasta) ]
+
+    main:
+    ch_versions = Channel.empty()
+
+    // The same alignment file may be requested at several depths: measure it only once.
+    // NOTE: meta is the matching key below, so it must identify a single alignment file.
+    ch_bam_bai = ch_bam_bai_depth
+        .map { meta, bam, bai, _depth -> [ meta, bam, bai ] }
+        .unique()
+
+    // Compute per-position depth
+    SAMTOOLS_DEPTH(ch_bam_bai, [[], []])
+    ch_versions = ch_versions.mix(SAMTOOLS_DEPTH.out.versions.first())
+
+    // Use GAWK to reduce the depth table to a single mean value
+    GAWK(SAMTOOLS_DEPTH.out.tsv, [])
+    ch_versions = ch_versions.mix(GAWK.out.versions.first())
+
+    // Parse the mean depth: [ meta, mean ]
+    ch_mean_depth = GAWK.out.output
+        .splitCsv(header: false, sep: '\t')
+        .map { meta, row -> [ meta, row[0] as Float ] }
+
+    // Attach the mean depth to every requested depth and derive the fraction of reads to keep.
+    // combine(by: 0) is used instead of join because one meta can carry several depths;
+    // join would see duplicate keys (an error when failOnDuplicate / strict mode is enabled).
+    ch_input_subsample = ch_bam_bai_depth
+        .combine(ch_mean_depth, by: 0)
+        .map { meta, bam, index, depth, mean ->
+            // A zero/NaN mean would yield an Infinity/NaN fraction that samtools cannot use
+            if (!(mean > 0)) {
+                error("Cannot subsample ${meta.id}: mean depth is ${mean}")
+            }
+            [ meta + [ 'subsample_fraction': (depth as Float) / mean, 'depth': depth ], bam, index ]
+        }
+
+    // Downsample
+    SAMTOOLS_VIEW(
+        ch_input_subsample,
+        ch_fasta,
+        []
+    )
+    ch_versions = ch_versions.mix(SAMTOOLS_VIEW.out.versions.first())
+
+    // Aggregate alignment files with their index, whatever format SAMTOOLS_VIEW produced
+    ch_bam_subsampled = SAMTOOLS_VIEW.out.bam.mix(SAMTOOLS_VIEW.out.cram, SAMTOOLS_VIEW.out.sam)
+        .join(SAMTOOLS_VIEW.out.bai.mix(SAMTOOLS_VIEW.out.crai, SAMTOOLS_VIEW.out.csi))
+
+    emit:
+    bam_subsampled = ch_bam_subsampled // channel: [ val(meta), path(bam), path(csi) ]
+    versions       = ch_versions       // channel: [ path(versions.yml) ]
+}
diff --git a/subworkflows/nf-core/bam_subsampledepth_samtools/meta.yml 
b/subworkflows/nf-core/bam_subsampledepth_samtools/meta.yml
new file mode 100644
index 000000000000..9690380f7033
--- /dev/null
+++ b/subworkflows/nf-core/bam_subsampledepth_samtools/meta.yml
@@ -0,0 +1,42 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "bam_subsampledepth_samtools"
+description: Subsample a BAM/CRAM/SAM file using samtools to a given mean depth
+keywords:
+  - subsample
+  - bam
+  - sam
+  - cram
+components:
+  - samtools/depth
+  - samtools/view
+  - gawk
+input:
+  - ch_bam_bai_depth:
+      type: file
+      description: |
+        The input channel containing the BAM/CRAM/SAM files, their indexes and the mean depth to which each file is subsampled.
+        Structure: [ val(meta), path(bam), path(bai), val(depth) ]
+      pattern: "*.{bam,cram,sam}"
+  - ch_fasta:
+      type: file
+      description: |
+        The reference genome channel containing the fasta file
+        Structure: [ val(meta), path(fasta) ]
+      pattern: "*.{fa,fasta,fa.gz,fasta.gz}"
+output:
+  - bam_subsampled:
+      type: file
+      description: |
+        Channel containing subsampled BAM/CRAM/SAM files and their indexes; `subsample_fraction` and `depth` are added to meta
+        Structure: [ val(meta), path(bam), path(csi) ]
+      pattern: "*.{bam,cram,sam}"
+  - versions:
+      type: file
+      description: |
+        File containing software versions
+        Structure: [ path(versions.yml) ]
+      pattern: "versions.yml"
+authors:
+  - "@louislenezet"
+maintainers:
+  - "@louislenezet"
diff --git a/subworkflows/nf-core/bam_subsampledepth_samtools/tests/main.nf.test b/subworkflows/nf-core/bam_subsampledepth_samtools/tests/main.nf.test
new file mode 100644
index 000000000000..f57cf0ae8d38
--- /dev/null
+++ b/subworkflows/nf-core/bam_subsampledepth_samtools/tests/main.nf.test
@@ -0,0 +1,52 @@
+nextflow_workflow {
+
+    name "Test Subworkflow BAM_SUBSAMPLEDEPTH_SAMTOOLS"
+    script "../main.nf"
+    config "./nextflow.config"
+
+    workflow "BAM_SUBSAMPLEDEPTH_SAMTOOLS"
+
+    tag "subworkflows"
+    tag "subworkflows_nfcore"
+    tag "subworkflows/bam_subsampledepth_samtools"
+
+    tag "samtools"
+    tag "samtools/depth"
+    tag "samtools/view"
+    tag "gawk"
+
+    test("Downsample to 4X and 2X") { // two samples x two target depths -> four subsampled BAMs
+        when {
+            workflow {
+                """
+                input[0] = Channel.fromList([
+                    [
+                        [id: "NA12878"],
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA12878/NA12878.s.bam", checkIfExists:true),
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA12878/NA12878.s.bam.bai", checkIfExists:true),
+                    ],
+                    [
+                        [id: "NA19401"],
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA19401/NA19401.s.bam", checkIfExists:true),
+                        file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA19401/NA19401.s.bam.bai", checkIfExists:true),
+                    ],
+                ]).combine(Channel.of(2, 4))
+                input[1] = Channel.of([
+                    [id: "GRCh38"],
+                    file("https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/reference_genome/GRCh38.s.fa.gz", checkIfExists:true),
+                ]).collect()
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                { assert snapshot(
+                    workflow.out,
+                    workflow.out.bam_subsampled.collect{ [it[0], bam(it[1]).getReads().size()] } // read count per output BAM
+                ).match() }
+            )
+        }
+    }
+}
diff --git a/subworkflows/nf-core/bam_subsampledepth_samtools/tests/main.nf.test.snap b/subworkflows/nf-core/bam_subsampledepth_samtools/tests/main.nf.test.snap
new file mode 100644
index 000000000000..8ff1e9c51878
--- /dev/null
+++ b/subworkflows/nf-core/bam_subsampledepth_samtools/tests/main.nf.test.snap
@@ -0,0 +1,133 @@
+{
+    "Downsample to 4X and 2X": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "NA12878",
+                            "subsample_fraction": 0.06201800538763442,
+                            "depth": 2
+                        },
+                        "NA12878.bam:md5,c998482010b83365a4889c3fa75ac578",
+                        "NA12878.bam.csi:md5,0b4abd161cdcc2b51571c9cc651df354"
+                    ],
+                    [
+                        {
+                            "id": "NA12878",
+                            "subsample_fraction": 0.12403601077526884,
+                            "depth": 4
+                        },
+                        "NA12878.bam:md5,d6045df32f7c77d5c863b2068739faa9",
+ 
"NA12878.bam.csi:md5,3f8f9a17463b6c2391c632681d3f253a" + ], + [ + { + "id": "NA19401", + "subsample_fraction": 0.062137851766009305, + "depth": 2 + }, + "NA19401.bam:md5,6b144e7adc1f9e6711aea0e4772c5937", + "NA19401.bam.csi:md5,36c29896003f2de5306ff427a969058c" + ], + [ + { + "id": "NA19401", + "subsample_fraction": 0.12427570353201861, + "depth": 4 + }, + "NA19401.bam:md5,3b0d20e5f44952135a547c6230117460", + "NA19401.bam.csi:md5,b44701d7d4de827cc1b83aaeb397deaf" + ] + ], + "1": [ + "versions.yml:md5,8019e4c6fa3c1ddea25d64d6bfe5651f", + "versions.yml:md5,b31618773ed8a31f8635bca3da001eeb", + "versions.yml:md5,f134df55b5047c5a8222ca04cf5ec501" + ], + "bam_subsampled": [ + [ + { + "id": "NA12878", + "subsample_fraction": 0.06201800538763442, + "depth": 2 + }, + "NA12878.bam:md5,c998482010b83365a4889c3fa75ac578", + "NA12878.bam.csi:md5,0b4abd161cdcc2b51571c9cc651df354" + ], + [ + { + "id": "NA12878", + "subsample_fraction": 0.12403601077526884, + "depth": 4 + }, + "NA12878.bam:md5,d6045df32f7c77d5c863b2068739faa9", + "NA12878.bam.csi:md5,3f8f9a17463b6c2391c632681d3f253a" + ], + [ + { + "id": "NA19401", + "subsample_fraction": 0.062137851766009305, + "depth": 2 + }, + "NA19401.bam:md5,6b144e7adc1f9e6711aea0e4772c5937", + "NA19401.bam.csi:md5,36c29896003f2de5306ff427a969058c" + ], + [ + { + "id": "NA19401", + "subsample_fraction": 0.12427570353201861, + "depth": 4 + }, + "NA19401.bam:md5,3b0d20e5f44952135a547c6230117460", + "NA19401.bam.csi:md5,b44701d7d4de827cc1b83aaeb397deaf" + ] + ], + "versions": [ + "versions.yml:md5,8019e4c6fa3c1ddea25d64d6bfe5651f", + "versions.yml:md5,b31618773ed8a31f8635bca3da001eeb", + "versions.yml:md5,f134df55b5047c5a8222ca04cf5ec501" + ] + }, + [ + [ + { + "id": "NA12878", + "subsample_fraction": 0.06201800538763442, + "depth": 2 + }, + 1164 + ], + [ + { + "id": "NA12878", + "subsample_fraction": 0.12403601077526884, + "depth": 4 + }, + 2402 + ], + [ + { + "id": "NA19401", + "subsample_fraction": 0.062137851766009305, + "depth": 2 + }, + 
1196
+                ],
+                [
+                    {
+                        "id": "NA19401",
+                        "subsample_fraction": 0.12427570353201861,
+                        "depth": 4
+                    },
+                    2321
+                ]
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.8.4",
+            "nextflow": "24.04.3"
+        },
+        "timestamp": "2024-07-24T11:40:16.846985786"
+    }
+}
\ No newline at end of file
diff --git a/subworkflows/nf-core/bam_subsampledepth_samtools/tests/nextflow.config b/subworkflows/nf-core/bam_subsampledepth_samtools/tests/nextflow.config
new file mode 100644
index 000000000000..80b65be71ce6
--- /dev/null
+++ b/subworkflows/nf-core/bam_subsampledepth_samtools/tests/nextflow.config
@@ -0,0 +1,9 @@
+process { // Test-time settings for the modules called by BAM_SUBSAMPLEDEPTH_SAMTOOLS
+    withName: GAWK {
+        ext.args2 = "'{ total += \$3 } END { print total/NR }'" // awk program: sum column 3 over all rows, then print the sum divided by the row count (presumably column 3 is the depth column of samtools depth - confirm)
+        ext.suffix = "txt"
+    }
+    withName: SAMTOOLS_VIEW {
+        ext.args = { "--write-index --subsample ${meta.subsample_fraction}" } // closure reading the fraction that the subworkflow stores in meta.subsample_fraction
+    }
+}
diff --git a/subworkflows/nf-core/bam_subsampledepth_samtools/tests/tags.yml b/subworkflows/nf-core/bam_subsampledepth_samtools/tests/tags.yml
new file mode 100644
index 000000000000..f66f34e315f5
--- /dev/null
+++ b/subworkflows/nf-core/bam_subsampledepth_samtools/tests/tags.yml
@@ -0,0 +1,2 @@
+subworkflows/bam_subsampledepth_samtools:
+  - subworkflows/nf-core/bam_subsampledepth_samtools/**