diff --git a/CHANGELOG.md b/CHANGELOG.md index bb6d2b89e9..792688770a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,8 +25,10 @@ Piellorieppe is one of the main massif in the Sarek National Park. - [#175](https://github.com/nf-core/sarek/pull/175) - Add `Sentieon` documentation - [#176](https://github.com/nf-core/sarek/pull/176) - Add empty `custom` genome in `genomes.config` to allow genomes that are not in `AWS iGenomes` - [#179](https://github.com/nf-core/sarek/pull/179) - Add `FreeBayes` germline variant calling -- [#180](https://github.com/nf-core/sarek/pull/180) - Now saving Mapped Bams (and creating TSV) in minimal setting +- [#180](https://github.com/nf-core/sarek/pull/180) - Now saving Mapped BAMs (and creating TSV) in minimal setting - [#182](https://github.com/nf-core/sarek/pull/182) - Add possibility to run `HaplotypeCaller` without `dbsnp` so it can be used to actually generate vcfs to build a set of known sites (cf [gatkforums](https://gatkforums.broadinstitute.org/gatk/discussion/1247/what-should-i-use-as-known-variants-sites-for-running-tool-x)) +- [#195](https://github.com/nf-core/sarek/pull/195) - Now creating TSV for duplicates marked BAMs in minimal setting +- [#195](https://github.com/nf-core/sarek/pull/195) - Add `--same_bam_mapped` params to save mapped BAMs. ### Changed - [2.6dev] @@ -55,7 +57,7 @@ Piellorieppe is one of the main massif in the Sarek National Park. - [#143](https://github.com/nf-core/sarek/pull/143) - Revert `snpEff` cache version to `86` for `GRCh38` - [#152](https://github.com/nf-core/sarek/pull/152), [#158](https://github.com/nf-core/sarek/pull/158), [#164](https://github.com/nf-core/sarek/pull/164), [#174](https://github.com/nf-core/sarek/pull/174), [#194](https://github.com/nf-core/sarek/pull/194) - Update docs - [#164](https://github.com/nf-core/sarek/pull/164) - Update `gatk4-spark` from `4.1.4.1` to `4.1.6.0` -- [#180](https://github.com/nf-core/sarek/pull/180) - Improve minimal setting +- [#180](https://github.com/nf-core/sarek/pull/180), [#195](https://github.com/nf-core/sarek/pull/195) - Improve minimal setting - [#183](https://github.com/nf-core/sarek/pull/183) - Update `input.md` documentation ### Fixed - [2.6dev] @@ -379,7 +381,7 @@ Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) ### Fixed - [2.3] -- [#720](https://github.com/SciLifeLab/Sarek/pull/720) - `bamQC` is now run on the recalibrated bams, and not after `MarkDuplicates` +- [#720](https://github.com/SciLifeLab/Sarek/pull/720) - `bamQC` is now run on the recalibrated BAMs, and not after `MarkDuplicates` - [#726](https://github.com/SciLifeLab/Sarek/pull/726) - Fix `Ascat` ref file input (one file can't be a set) - [#727](https://github.com/SciLifeLab/Sarek/pull/727) - `bamQC` outputs are no longer overwritten (name of dir is now the file instead of sample) - [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix issue with annotation that was consuming `cache` channels diff --git a/docs/usage.md b/docs/usage.md index d3787f7f3b..10b2447410 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -34,6 +34,7 @@ - [--no_intervals](#--no_intervals) - [--target_bed](#--target_bed) - [--targetBED](#--targetbed) + - [--save_bam_mapped](#--save_bam_mapped) - [Reference genomes](#reference-genomes) - [--genome (using iGenomes)](#--genome-using-igenomes) - [--ac_loci](#--ac_loci) @@ -372,6 +373,10 @@ Use this to specify the target BED file for targeted or whole exome sequencing. > :warning: This params is deprecated -- it will be removed in a future release. > Please check: [`--target_bed`](#--target_bed) +### --save_bam_mapped + +Will save mapped BAMs. + ## Reference genomes The pipeline config files come bundled with paths to the Illumina iGenomes reference index files. diff --git a/main.nf b/main.nf index c2a45a73c3..fdc6b36555 100644 --- a/main.nf +++ b/main.nf @@ -1092,10 +1092,10 @@ process MapReads { convertToFastq = hasExtension(inputFile1, "bam") ? "gatk --java-options -Xmx${task.memory.toGiga()}g SamToFastq --INPUT=${inputFile1} --FASTQ=/dev/stdout --INTERLEAVE=true --NON_PF=true | \\" : "" input = hasExtension(inputFile1, "bam") ? "-p /dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2)" : "${inputFile1} ${inputFile2}" """ - ${convertToFastq} - bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ - ${input} | \ - samtools sort --threads ${task.cpus} -m 2G - > ${idSample}_${idRun}.bam + ${convertToFastq} + bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ + ${input} | \ + samtools sort --threads ${task.cpus} -m 2G - > ${idSample}_${idRun}.bam """ } @@ -1147,7 +1147,7 @@ process Sentieon_MapReads { sentieon bwa mem -K 100000000 -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${fasta} \ ${inputFile1} ${inputFile2} | \ sentieon util sort -r ${fasta} -o ${idSample}_${idRun}.bam -t ${task.cpus} --sam2bam -i - - """ + """ } bam_sentieon_mapped = bam_sentieon_mapped.dump(tag:'Sentieon Mapped BAM') @@ -1220,7 +1220,11 @@ process IndexBamFile { tag {idPatient + "-" + idSample} - publishDir "${params.outdir}/Preprocessing/${idSample}/Mapped", mode: params.publish_dir_mode + publishDir params.outdir, mode: params.publish_dir_mode, + saveAs: { + if (params.save_bam_mapped) "Preprocessing/${idSample}/Mapped/${it}" + null + } input: set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged_to_index @@ -1237,6 +1241,8 @@ process IndexBamFile { """ } +if (!params.save_bam_mapped) tsv_bam_indexed.close() + (tsv_bam_indexed, tsv_bam_indexed_sample) = tsv_bam_indexed.into(2) // Creating a TSV file to restart from this step @@ -1275,11 +1281,10 @@ process MarkDuplicates { set idPatient, idSample, file("${idSample}.bam") from bam_mapped_merged output: - set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into duplicateMarkedBams + set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into bam_duplicates_marked + set idPatient, idSample into tsv_bam_duplicates_marked file ("${idSample}.bam.metrics") optional true into markDuplicatesReport - when: params.known_indels - script: markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" metrics = 'markduplicates' in skipQC ? '' : "-M ${idSample}.bam.metrics" @@ -1310,12 +1315,34 @@ process MarkDuplicates { """ } +(tsv_bam_duplicates_marked, tsv_bam_duplicates_marked_sample) = tsv_bam_duplicates_marked.into(2) + +// Creating a TSV file to restart from this step +tsv_bam_duplicates_marked.map { idPatient, idSample -> + gender = genderMap[idPatient] + status = statusMap[idPatient, idSample] + bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam.bai" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" +}.collectFile( + name: 'duplicatemarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +) + +tsv_bam_duplicates_marked_sample + .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> + status = statusMap[idPatient, idSample] + gender = genderMap[idPatient] + bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam.bai" + ["duplicatemarked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] +} + if ('markduplicates' in skipQC) markDuplicatesReport.close() -duplicateMarkedBams = duplicateMarkedBams.dump(tag:'MD BAM') +bam_duplicates_marked = bam_duplicates_marked.dump(tag:'MD BAM') markDuplicatesReport = markDuplicatesReport.dump(tag:'MD Report') -(bamMD, bamMDToJoin) = duplicateMarkedBams.into(2) +(bamMD, bamMDToJoin, bam_duplicates_marked) = bam_duplicates_marked.into(3) bamBaseRecalibrator = bamMD.combine(intBaseRecalibrator) @@ -1511,7 +1538,7 @@ process ApplyBQSR { file(fastaFai) from ch_fai output: - set idPatient, idSample, file("${prefix}${idSample}.recal.bam") into bamMergeBamRecal + set idPatient, idSample, file("${prefix}${idSample}.recal.bam") into bam_recalibrated_to_merge script: prefix = params.no_intervals ? "" : "${intervalBed.baseName}_" @@ -1527,8 +1554,7 @@ process ApplyBQSR { """ } -bamMergeBamRecal = bamMergeBamRecal.groupTuple(by:[0, 1]) -(bamMergeBamRecal, bamMergeBamRecalNoInt) = bamMergeBamRecal.into(2) +(bam_recalibrated_to_merge, bam_recalibrated_to_index) = bam_recalibrated_to_merge.groupTuple(by:[0, 1]).into(2) // STEP 4': SENTIEON BQSR @@ -1649,12 +1675,12 @@ process MergeBamRecal { publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode input: - set idPatient, idSample, file(bam) from bamMergeBamRecal + set idPatient, idSample, file(bam) from bam_recalibrated_to_merge output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bamRecal - set idPatient, idSample, file("${idSample}.recal.bam") into bamRecalQC - set idPatient, idSample into bamRecalTSV + set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated + set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_qc + set idPatient, idSample into tsv_bam_recalibrated when: !(params.no_intervals) @@ -1675,12 +1701,12 @@ process IndexBamRecal { publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publish_dir_mode input: - set idPatient, idSample, file("${idSample}.recal.bam") from bamMergeBamRecalNoInt + set idPatient, idSample, file("${idSample}.recal.bam") from bam_recalibrated_to_index output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bamRecalNoInt - set idPatient, idSample, file("${idSample}.recal.bam") into bamRecalQCnoInt - set idPatient, idSample into bamRecalTSVnoInt + set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_recalibrated_indexed + set idPatient, idSample, file("${idSample}.recal.bam") into bam_recalibrated_no_int_qc + set idPatient, idSample into tsv_bam_recalibrated_no_int when: params.no_intervals @@ -1690,15 +1716,15 @@ process IndexBamRecal { """ } -bamRecal = bamRecal.mix(bamRecalNoInt) -bamRecalQC = bamRecalQC.mix(bamRecalQCnoInt) -bamRecalTSV = bamRecalTSV.mix(bamRecalTSVnoInt) +bam_recalibrated = bam_recalibrated.mix(bam_recalibrated_indexed) +bam_recalibrated_qc = bam_recalibrated_qc.mix(bam_recalibrated_no_int_qc) +tsv_bam_recalibrated = tsv_bam_recalibrated.mix(tsv_bam_recalibrated_no_int) -(bamRecalBamQC, bamRecalSamToolsStats) = bamRecalQC.into(2) -(bamRecalTSV, bamRecalSampleTSV) = bamRecalTSV.into(2) +(bam_recalibrated_bamqc, bam_recalibrated_samtools_stats) = bam_recalibrated_qc.into(2) +(tsv_bam_recalibrated, tsv_bam_recalibrated_sample) = tsv_bam_recalibrated.into(2) // Creating a TSV file to restart from this step -bamRecalTSV.map { idPatient, idSample -> +tsv_bam_recalibrated.map { idPatient, idSample -> gender = genderMap[idPatient] status = statusMap[idPatient, idSample] bam = "${params.outdir}/Preprocessing/${idSample}/Recalibrated/${idSample}.recal.bam" @@ -1708,7 +1734,7 @@ bamRecalTSV.map { idPatient, idSample -> name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" ) -bamRecalSampleTSV +tsv_bam_recalibrated_sample .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> status = statusMap[idPatient, idSample] @@ -1728,7 +1754,7 @@ process SamtoolsStats { publishDir "${params.outdir}/Reports/${idSample}/SamToolsStats", mode: params.publish_dir_mode input: - set idPatient, idSample, file(bam) from bamRecalSamToolsStats + set idPatient, idSample, file(bam) from bam_recalibrated_samtools_stats output: file ("${bam}.samtools.stats.out") into samtoolsStatsReport @@ -1743,7 +1769,7 @@ process SamtoolsStats { samtoolsStatsReport = samtoolsStatsReport.dump(tag:'SAMTools') -bamBamQC = bamMappedBamQC.mix(bamRecalBamQC) +bamBamQC = bamMappedBamQC.mix(bam_recalibrated_bamqc) process BamQC { label 'memory_max' @@ -1787,23 +1813,23 @@ bamQCReport = bamQCReport.dump(tag:'BamQC') ================================================================================ */ -// When using sentieon for mapping, Channel bamRecal is bam_sentieon_recal -if (params.sentieon && step == 'mapping') bamRecal = bam_sentieon_recal +// When using sentieon for mapping, Channel bam_recalibrated is bam_sentieon_recal +if (params.sentieon && step == 'mapping') bam_recalibrated = bam_sentieon_recal -// When no knownIndels for mapping, Channel bamRecal is bam_mapped_merged_indexed -bamRecal = (params.known_indels && step == 'mapping') ? bamRecal : bam_mapped_merged_indexed +// When no knownIndels for mapping, Channel bam_recalibrated is bam_duplicates_marked +bam_recalibrated = (params.known_indels && step == 'mapping') ? bam_recalibrated : bam_duplicates_marked -// When starting with variant calling, Channel bamRecal is inputSample -if (step == 'variantcalling') bamRecal = inputSample +// When starting with variant calling, Channel bam_recalibrated is inputSample +if (step == 'variantcalling') bam_recalibrated = inputSample -bamRecal = bamRecal.dump(tag:'BAM for Variant Calling') +bam_recalibrated = bam_recalibrated.dump(tag:'BAM for Variant Calling') // Here we have a recalibrated bam set // The TSV file is formatted like: "idPatient status idSample bamFile baiFile" // Manta will be run in Germline mode, or in Tumor mode depending on status // HaplotypeCaller, TIDDIT and Strelka will be run for Normal and Tumor samples -(bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bamRecal.into(6) +(bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bam_recalibrated.into(6) (bam_sentieon_DNAseq, bam_sentieon_DNAscope, bam_sentieon_all) = bam_sentieon_deduped_table.into(3) @@ -2425,8 +2451,9 @@ process MergePileupSummaries { set idPatient, idSampleNormal, idSampleTumor, file("${idSampleTumor}_pileupsummaries.table") into mergedPileupFile when: 'mutect2' in tools + script: - allPileups = pileupSums.collect{ "-I ${it} " }.join(' ') + allPileups = pileupSums.collect{ "-I ${it} " }.join(' ') """ gatk --java-options "-Xmx${task.memory.toGiga()}g" \ GatherPileupSummaries \ @@ -2459,7 +2486,7 @@ process CalculateContamination { when: 'mutect2' in tools script: - """ + """ # calculate contamination gatk --java-options "-Xmx${task.memory.toGiga()}g" \ CalculateContamination \ diff --git a/nextflow.config b/nextflow.config index c9fdbfd836..3bf2d65302 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,6 +23,7 @@ params { // Results outdir = './results' publish_dir_mode = 'copy' // Default PublishDirMode (same as other nf-core pipelines) + save_bam_mapped = null // Mapped BAMs not saved // Modify fastqs (trim/split) trim_fastq = false // No trimming