From 3711ed6d5f883d36e9cf918b91c70131c53fc8a2 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Apr 2020 13:47:13 +0200 Subject: [PATCH 1/7] add params save_bam_mapped --- docs/usage.md | 5 +++++ nextflow.config | 1 + 2 files changed, 6 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index d3787f7f3b..10b2447410 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -34,6 +34,7 @@ - [--no_intervals](#--no_intervals) - [--target_bed](#--target_bed) - [--targetBED](#--targetbed) + - [--save_bam_mapped](#--save_bam_mapped) - [Reference genomes](#reference-genomes) - [--genome (using iGenomes)](#--genome-using-igenomes) - [--ac_loci](#--ac_loci) @@ -372,6 +373,10 @@ Use this to specify the target BED file for targeted or whole exome sequencing. > :warning: This params is deprecated -- it will be removed in a future release. > Please check: [`--target_bed`](#--target_bed) +### --save_bam_mapped + +Will save mapped BAMs. + ## Reference genomes The pipeline config files come bundled with paths to the Illumina iGenomes reference index files. diff --git a/nextflow.config b/nextflow.config index c9fdbfd836..3bf2d65302 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,6 +23,7 @@ params { // Results outdir = './results' publish_dir_mode = 'copy' // Default PublishDirMode (same as other nf-core pipelines) + save_bam_mapped = null // Mapped BAMs not saved // Modify fastqs (trim/split) trim_fastq = false // No trimming From 8703acc9316010dbe4d808480d5975221450f890 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Apr 2020 13:51:29 +0200 Subject: [PATCH 2/7] add params save_bam_mapped, and MarkDuplicates bams even when no dbsnp --- main.nf | 95 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 34 deletions(-) diff --git a/main.nf b/main.nf index e0184c2e65..671b214e22 100644 --- a/main.nf +++ b/main.nf @@ -1226,7 +1226,11 @@ process IndexBamFile { tag {idPatient + "-" + idSample} - publishDir "${params.outdir}/Preprocessing/${idSample}/Mapped", mode: params.publish_dir_mode + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { + if (params.save_bam_mapped) "Preprocessing/${idSample}/Mapped/${it}" + null + } input: set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged_to_index @@ -1243,6 +1247,8 @@ process IndexBamFile { """ } +if (!params.save_bam_mapped) tsv_bam_indexed.close() + (tsv_bam_indexed, tsv_bam_indexed_sample) = tsv_bam_indexed.into(2) // Creating a TSV file to restart from this step @@ -1281,11 +1287,10 @@ process MarkDuplicates { set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged output: - set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into duplicateMarkedBams + set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into bam_duplicates_marked + set idPatient, idSample into tsv_bam_duplicates_marked file ("${idSample}.bam.metrics") optional true into markDuplicatesReport - when: params.known_indels - script: markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" metrics = 'markduplicates' in skipQC ? '' : "-M ${idSample}.bam.metrics" @@ -1316,12 +1321,34 @@ process MarkDuplicates { """ } +(tsv_bam_duplicates_marked, tsv_bam_duplicates_marked_sample) = tsv_bam_duplicates_marked.into(2) + +// Creating a TSV file to restart from this step +tsv_bam_duplicates_marked.map { idPatient, idSample -> + gender = genderMap[idPatient] + status = statusMap[idPatient, idSample] + bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam.bai" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" +}.collectFile( + name: 'duplicatemarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +) + +tsv_bam_duplicates_marked_sample + .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> + status = statusMap[idPatient, idSample] + gender = genderMap[idPatient] + bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam.bai" + ["duplicatemarked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] +} + if ('markduplicates' in skipQC) markDuplicatesReport.close() -duplicateMarkedBams = duplicateMarkedBams.dump(tag:'MD BAM') +bam_duplicates_marked = bam_duplicates_marked.dump(tag:'MD BAM') markDuplicatesReport = markDuplicatesReport.dump(tag:'MD Report') -(bamMD, bamMDToJoin) = duplicateMarkedBams.into(2) +(bamMD, bamMDToJoin, bam_duplicates_marked) = bam_duplicates_marked.into(3) bamBaseRecalibrator = bamMD.combine(intBaseRecalibrator) @@ -1518,7 +1545,7 @@ process ApplyBQSR { file(fastaFai) from ch_fai output: - set idPatient, idSample, file("${prefix}${idSample}.recal.bam") into bamMergeBamRecal + set idPatient, idSample, file("${prefix}${idSample}.recal.bam") into bam_recalibrated_to_merge script: prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" @@ -1534,8 +1561,7 @@ process ApplyBQSR { """ } -bamMergeBamRecal = bamMergeBamRecal.groupTuple(by:[0, 1]) -(bamMergeBamRecal, bamMergeBamRecalNoInt) = bamMergeBamRecal.into(2) +(bam_recalibrated_to_merge, bam_recalibrated_to_index) = bam_recalibrated_to_merge.groupTuple(by:[0, 1]).into(2) // STEP 4': SENTIEON BQSR @@ -1635,12 +1661,12 @@ process MergeBamRecal { publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode input: - set idPatient, idSample, file(bam) from bamMergeBamRecal + set idPatient, idSample, file(bam) from bam_recalibrated_to_merge output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bamRecal - set idPatient, idSample, file("${idSample}.recal.bam") into bamRecalQC - set idPatient, idSample into bamRecalTSV + set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated + set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_qc + set idPatient, idSample into tsv_bam_recalibrated when: !(params.no_intervals) @@ -1653,7 +1679,7 @@ process MergeBamRecal { // STEP 4.5': INDEXING THE RECALIBRATED BAM FILES -process IndexBamRecal { +process Indexbam_recalibrated { label 'cpus_8' tag {idPatient + "-" + idSample} @@ -1661,12 +1687,12 @@ process IndexBamRecal { publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode input: - set idPatient, idSample, file("${idSample}.recal.bam") from bamMergeBamRecalNoInt + set idPatient, idSample, file("${idSample}.recal.bam") from bam_recalibrated_to_index output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bamRecalNoInt - set idPatient, idSample, file("${idSample}.recal.bam") into bamRecalQCnoInt - set idPatient, idSample into bamRecalTSVnoInt + set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated_indexed + set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_no_int_QC + set idPatient, idSample into tsv_bam_recalibrated_no_int when: params.no_intervals @@ -1676,15 +1702,15 @@ process IndexBamRecal { """ } -bamRecal = bamRecal.mix(bamRecalNoInt) -bamRecalQC = bamRecalQC.mix(bamRecalQCnoInt) -bamRecalTSV = bamRecalTSV.mix(bamRecalTSVnoInt) +bam_recalibrated = bam_recalibrated.mix(bam_recalibrated_indexed) +bam_recalibrated_QC = bam_recalibrated_QC.mix(bam_recalibrated_no_int_QC) +tsv_bam_recalibrated = tsv_bam_recalibrated.mix(tsv_bam_recalibrated_no_int) (bamRecalBamQC, bamRecalSamToolsStats) = bamRecalQC.into(2) -(bamRecalTSV, bamRecalSampleTSV) = bamRecalTSV.into(2) +(tsv_bam_recalibrated, tsv_bam_recalibrated_sample) = tsv_bam_recalibrated.into(2) // Creating a TSV file to restart from this step -bamRecalTSV.map { idPatient, idSample -> +tsv_bam_recalibrated.map { idPatient, idSample -> gender = genderMap[idPatient] status = statusMap[idPatient, idSample] bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" @@ -1694,7 +1720,7 @@ bamRecalTSV.map { idPatient, idSample -> name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" ) -bamRecalSampleTSV +tsv_bam_recalibrated_sample .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> status = statusMap[idPatient, idSample] @@ -1773,23 +1799,23 @@ bamQCReport = bamQCReport.dump(tag:'BamQC') ================================================================================ */ -// When using sentieon for mapping, Channel bamRecal is bamRecalSentieon -if (params.sentieon && step == 'mapping') bamRecal = bamRecalSentieon +// When using sentieon for mapping, Channel bam_recalibrated is bamRecalSentieon +if (params.sentieon && step == 'mapping') bam_recalibrated = bamRecalSentieon -// When no knownIndels for mapping, Channel bamRecal is bam_mapped_merged_indexed -bamRecal = (params.known_indels && step == 'mapping') ? bamRecal : bam_mapped_merged_indexed +// When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked +bam_recalibrated = (params.known_indels && step == 'mapping') ? bam_recalibrated : bam_duplicates_marked -// When starting with variant calling, Channel bamRecal is inputSample -if (step == 'variantcalling') bamRecal = inputSample +// When starting with variant calling, Channel bam_recalibrated is inputSample +if (step == 'variantcalling') bam_recalibrated = inputSample -bamRecal = bamRecal.dump(tag:'BAM for Variant Calling') +bam_recalibrated = bam_recalibrated.dump(tag:'BAM for Variant Calling') // Here we have a recalibrated bam set // The TSV file is formatted like: "idPatient status idSample bamFile baiFile" // Manta will be run in Germline mode, or in Tumor mode depending on status // HaplotypeCaller, TIDDIT and Strelka will be run for Normal and Tumor samples -(bamSentieonDNAscope, bamSentieonDNAseq, bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bamRecal.into(8) +(bamSentieonDNAscope, bamSentieonDNAseq, bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bam_recalibrated.into(8) // To speed Variant Callers up we are chopping the reference into smaller pieces // Do variant calling by this intervals, and re-merge the VCFs @@ -2388,8 +2414,9 @@ process MergePileupSummaries { set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}_pileupsummaries.table") into mergedPileupFile when: 'mutect2' in tools + script: - allPileups = pileupSums.collect{ "-I ${it} " }.join(' ') + allPileups = pileupSums.collect{ "-I ${it} " }.join(' ') """ gatk --java-options "-Xmx${task.memory.toGiga()}g" \ GatherPileupSummaries \ @@ -2422,7 +2449,7 @@ process CalculateContamination { when: 'mutect2' in tools script: - """ + """ # calculate contamination gatk --java-options "-Xmx${task.memory.toGiga()}g" \ CalculateContamination \ From bed84ffbb3e15cf29301e3a90d0134a4a4e382f4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Apr 2020 13:54:45 +0200 Subject: [PATCH 3/7] typo --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 04e8cc3589..8ad37cb3ce 100644 --- a/main.nf +++ b/main.nf @@ -1693,7 +1693,7 @@ process MergeBamRecal { // STEP 4.5': INDEXING THE RECALIBRATED BAM FILES -process Indexbam_recalibrated { +process IndexBamRecal { label 'cpus_8' tag {idPatient + "-" + idSample} From 69a555ad107c2abe7d771a91f3067d1deeb18d7c Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Apr 2020 14:04:05 +0200 Subject: [PATCH 4/7] UPPERCASE to lowercase --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 8ad37cb3ce..6f34ace2ce 100644 --- a/main.nf +++ b/main.nf @@ -1705,7 +1705,7 @@ process IndexBamRecal { output: set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated_indexed - set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_no_int_QC + set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_no_int_qc set idPatient, idSample into tsv_bam_recalibrated_no_int when: params.no_intervals @@ -1717,7 +1717,7 @@ process IndexBamRecal { } bam_recalibrated = bam_recalibrated.mix(bam_recalibrated_indexed) -bam_recalibrated_QC = bam_recalibrated_QC.mix(bam_recalibrated_no_int_QC) +bam_recalibrated_qc = bam_recalibrated_qc.mix(bam_recalibrated_no_int_qc) tsv_bam_recalibrated = tsv_bam_recalibrated.mix(tsv_bam_recalibrated_no_int) (bamRecalBamQC, bamRecalSamToolsStats) = bamRecalQC.into(2) From ce327328b309c1d4e59ebe49ae48d31da9984c73 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Apr 2020 14:10:55 +0200 Subject: [PATCH 5/7] fix channel names --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 6f34ace2ce..2c3d2cf67a 100644 --- a/main.nf +++ b/main.nf @@ -1720,7 +1720,7 @@ bam_recalibrated = bam_recalibrated.mix(bam_recalibrated_indexed) bam_recalibrated_qc = bam_recalibrated_qc.mix(bam_recalibrated_no_int_qc) tsv_bam_recalibrated = tsv_bam_recalibrated.mix(tsv_bam_recalibrated_no_int) -(bamRecalBamQC, bamRecalSamToolsStats) = bamRecalQC.into(2) +(bam_recalibrated_bamqc, bam_recalibrated_samtools_stats) = bam_recalibrated_qc.into(2) (tsv_bam_recalibrated, tsv_bam_recalibrated_sample) = tsv_bam_recalibrated.into(2) // Creating a TSV file to restart from this step @@ -1754,7 +1754,7 @@ process SamtoolsStats { publishDir "${params.outdir}/Reports/${idSample}/SamToolsStats", mode: params.publish_dir_mode input: - set idPatient, idSample, file(bam) from bamRecalSamToolsStats + set idPatient, idSample, file(bam) from bam_recalibrated_samtools_stats output: file ("${bam}.samtools.stats.out") into samtoolsStatsReport @@ -1769,7 +1769,7 @@ process SamtoolsStats { samtoolsStatsReport = samtoolsStatsReport.dump(tag:'SAMTools') -bamBamQC = bamMappedBamQC.mix(bamRecalBamQC) +bamBamQC = bamMappedBamQC.mix(bam_recalibrated_bamqc) process BamQC { label 'memory_max' From e3493f6cb98f566ad4518f62162c377162ec0c19 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 23 Apr 2020 16:04:32 +0200 Subject: [PATCH 6/7] update CHANGELOG --- CHANGELOG.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bb6d2b89e9..792688770a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,8 +25,10 @@ Piellorieppe is one of the main massif in the Sarek National Park. - [#175](https://github.com/nf-core/sarek/pull/175) - Add `Sentieon` documentation - [#176](https://github.com/nf-core/sarek/pull/176) - Add empty `custom` genome in `genomes.config` to allow genomes that are not in `AWS iGenomes` - [#179](https://github.com/nf-core/sarek/pull/179) - Add `FreeBayes` germline variant calling -- [#180](https://github.com/nf-core/sarek/pull/180) - Now saving Mapped Bams (and creating TSV) in minimal setting +- [#180](https://github.com/nf-core/sarek/pull/180) - Now saving Mapped BAMs (and creating TSV) in minimal setting - [#182](https://github.com/nf-core/sarek/pull/182) - Add possibility to run `HaplotypeCaller` without `dbsnp` so it can be used to actually generate vcfs to build a set of known sites (cf [gatkforums](https://gatkforums.broadinstitute.org/gatk/discussion/1247/what-should-i-use-as-known-variants-sites-for-running-tool-x)) +- [#195](https://github.com/nf-core/sarek/pull/195) - Now creating TSV for duplicates marked BAMs in minimal setting +- [#195](https://github.com/nf-core/sarek/pull/195) - Add `--same_bam_mapped` params to save mapped BAMs. ### Changed - [2.6dev] @@ -55,7 +57,7 @@ Piellorieppe is one of the main massif in the Sarek National Park. - [#143](https://github.com/nf-core/sarek/pull/143) - Revert `snpEff` cache version to `86` for `GRCh38` - [#152](https://github.com/nf-core/sarek/pull/152), [#158](https://github.com/nf-core/sarek/pull/158), [#164](https://github.com/nf-core/sarek/pull/164), [#174](https://github.com/nf-core/sarek/pull/174), [#194](https://github.com/nf-core/sarek/pull/194) - Update docs - [#164](https://github.com/nf-core/sarek/pull/164) - Update `gatk4-spark` from `4.1.4.1` to `4.1.6.0` -- [#180](https://github.com/nf-core/sarek/pull/180) - Improve minimal setting +- [#180](https://github.com/nf-core/sarek/pull/180), [#195](https://github.com/nf-core/sarek/pull/195) - Improve minimal setting - [#183](https://github.com/nf-core/sarek/pull/183) - Update `input.md` documentation ### Fixed - [2.6dev] @@ -379,7 +381,7 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### Fixed - [2.3] -- [#720](https://github.com/SciLifeLab/Sarek/pull/720) - `bamQC` is now run on the recalibrated bams, and not after `MarkDuplicates` +- [#720](https://github.com/SciLifeLab/Sarek/pull/720) - `bamQC` is now run on the recalibrated BAMs, and not after `MarkDuplicates` - [#726](https://github.com/SciLifeLab/Sarek/pull/726) - Fix `Ascat` ref file input (one file can't be a set) - [#727](https://github.com/SciLifeLab/Sarek/pull/727) - `bamQC` outputs are no longer overwritten (name of dir is now the file instead of sample) - [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix issue with annotation that was consuming `cache` channels From 94814c96aa297d5ac67cca536d700f609ec52fb1 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Fri, 24 Apr 2020 09:16:37 +0200 Subject: [PATCH 7/7] spacing --- main.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 2c3d2cf67a..fdc6b36555 100644 --- a/main.nf +++ b/main.nf @@ -1092,10 +1092,10 @@ process MapReads { convertToFastq = hasExtension(inputFile1, "bam") ? "gatk --java-options -Xmx${task.memory.toGiga()}g SamToFastq --INPUT=${inputFile1} --FASTQ=/dev/stdout --INTERLEAVE=true --NON_PF=true | \\" : "" input = hasExtension(inputFile1, "bam") ? "-p /dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2)" : "${inputFile1} ${inputFile2}" """ - ${convertToFastq} - bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ - ${input} | \ - samtools sort --threads ${task.cpus} -m 2G - > ${idSample}_${idRun}.bam + ${convertToFastq} + bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ + ${input} | \ + samtools sort --threads ${task.cpus} -m 2G - > ${idSample}_${idRun}.bam """ } @@ -1147,7 +1147,7 @@ process Sentieon_MapReads { sentieon bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ ${inputFile1} ${inputFile2} | \ sentieon util sort -r ${fasta} -o ${idSample}_${idRun}.bam -t ${task.cpus} --sam2bam -i - - """ + """ } bam_sentieon_mapped = bam_sentieon_mapped.dump(tag:'Sentieon Mapped BAM')