diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c87e4a9c1..bb6d2b89e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,6 +80,7 @@ Piellorieppe is one of the main massif in the Sarek National Park. - [#164](https://github.com/nf-core/sarek/pull/164) - Fix issues when running with `Sentieon` - [#164](https://github.com/nf-core/sarek/pull/164) - Add more VCFs to annotation - [#167](https://github.com/nf-core/sarek/pull/167) - Add `--markdup_java_options` documentation to fix [#166](https://github.com/nf-core/sarek/issues/166) +- [#178](https://github.com/nf-core/sarek/pull/178) - Fix `Sentieon` variant calling, now using deduped bam files - [#188](https://github.com/nf-core/sarek/pull/188) - Fix input/output channels for process `IndexBamFile` to match actual files in the `mapped.tsv` files - [#189](https://github.com/nf-core/sarek/pull/189) - Fix `no_intervals` for process `HaplotypeCaller` (the file just need to actually exists...) diff --git a/main.nf b/main.nf index e0184c2e65..c2a45a73c3 100644 --- a/main.nf +++ b/main.nf @@ -391,13 +391,8 @@ if (params.input && (hasExtension(params.input, "vcf") || hasExtension(params.in // If no input file specified, trying to get TSV files corresponding to step in the TSV directory // only for steps recalibrate and variantCalling if (!params.input && step != 'mapping' && step != 'annotate') { - if (params.sentieon) { - if (step == 'variantcalling') tsvPath = "${params.outdir}/Preprocessing/TSV/recalibrated_sentieon.tsv" - else exit 1, "Not possible to restart from that step" - } - else { - tsvPath = step == 'recalibrate' ? "${params.outdir}/Preprocessing/TSV/duplicateMarked.tsv" : "${params.outdir}/Preprocessing/TSV/recalibrated.tsv" - } + if (step == 'recalibrate') tsvPath = params.sentieon ? "${params.outdir}/Preprocessing/TSV/sentieon_deduped.tsv" : "${params.outdir}/Preprocessing/TSV/duplicateMarked.tsv" + else tsvPath = params.sentieon ? "${params.outdir}/Preprocessing/TSV/sentieon_recalibrated.tsv" : "${params.outdir}/Preprocessing/TSV/recalibrated.tsv" } inputSample = Channel.empty() @@ -1062,9 +1057,9 @@ else inputPairReads = inputPairReads.mix(inputBam) inputPairReads = inputPairReads.dump(tag:'INPUT') -(inputPairReads, inputPairReadsSentieon) = inputPairReads.into(2) +(inputPairReads, input_pair_reads_sentieon) = inputPairReads.into(2) if (params.sentieon) inputPairReads.close() -else inputPairReadsSentieon.close() +else input_pair_reads_sentieon.close() process MapReads { label 'cpus_max' @@ -1119,7 +1114,7 @@ singleBam = singleBam.dump(tag:'Single BAM') // STEP 1': MAPPING READS TO REFERENCE GENOME WITH SENTIEON BWA MEM -process SentieonMapReads { +process Sentieon_MapReads { label 'cpus_max' label 'memory_max' label 'sentieon' @@ -1127,14 +1122,13 @@ process SentieonMapReads { tag {idPatient + "-" + idRun} input: - set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from inputPairReadsSentieon + set idPatient, idSample, idRun, file(inputFile1), file(inputFile2) from input_pair_reads_sentieon file(bwaIndex) from ch_bwa file(fasta) from ch_fasta file(fastaFai) from ch_fai output: - set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bamMappedSentieon - set idPatient, idSample, file("${idSample}_${idRun}.bam") into bamMappedSentieonBamQC + set idPatient, idSample, idRun, file("${idSample}_${idRun}.bam") into bam_sentieon_mapped when: params.sentieon @@ -1156,12 +1150,12 @@ process SentieonMapReads { """ } -bamMappedSentieon = bamMappedSentieon.dump(tag:'Sentieon Mapped BAM') +bam_sentieon_mapped = bam_sentieon_mapped.dump(tag:'Sentieon Mapped BAM') // Sort BAM whether they are standalone or should be merged singleBamSentieon = Channel.create() multipleBamSentieon = Channel.create() -bamMappedSentieon.groupTuple(by:[0, 1]) +bam_sentieon_mapped.groupTuple(by:[0, 1]) .choice(singleBamSentieon, multipleBamSentieon) {it[2].size() > 1 ? 1 : 0} singleBamSentieon = singleBamSentieon.map { idPatient, idSample, idRun, bam -> @@ -1194,13 +1188,13 @@ bam_mapped_merged = bam_mapped_merged.dump(tag:'Merged BAM') bam_mapped_merged = bam_mapped_merged.mix(singleBam,singleBamSentieon) -(bam_mapped_merged, mergedBamForSentieon) = bam_mapped_merged.into(2) +(bam_mapped_merged, bam_sentieon_mapped_merged) = bam_mapped_merged.into(2) -if (!params.sentieon) mergedBamForSentieon.close() +if (!params.sentieon) bam_sentieon_mapped_merged.close() else bam_mapped_merged.close() bam_mapped_merged = bam_mapped_merged.dump(tag:'BAMs for MD') -mergedBamForSentieon = mergedBamForSentieon.dump(tag:'Sentieon BAMs to Index') +bam_sentieon_mapped_merged = bam_sentieon_mapped_merged.dump(tag:'Sentieon BAMs to Index') process IndexBamMergedForSentieon { label 'cpus_8' @@ -1208,14 +1202,14 @@ process IndexBamMergedForSentieon { tag {idPatient + "-" + idSample} input: - set idPatient, idSample, file("${idSample}.bam") from mergedBamForSentieon + set idPatient, idSample, file("${idSample}.bam") from bam_sentieon_mapped_merged output: - set idPatient, idSample, file("${idSample}.bam"), file("${idSample}.bam.bai") into bamForSentieonDedup + set idPatient, idSample, file("${idSample}.bam"), file("${idSample}.bam.bai") into bam_sentieon_mapped_merged_indexed script: """ - samtools index ${bam} + samtools index ${idSample}.bam """ } @@ -1329,7 +1323,7 @@ bamBaseRecalibrator = bamBaseRecalibrator.dump(tag:'BAM FOR BASERECALIBRATOR') // STEP 2': SENTIEON DEDUP -process SentieonDedup { +process Sentieon_Dedup { label 'cpus_max' label 'memory_max' label 'sentieon' @@ -1340,17 +1334,16 @@ process SentieonDedup { saveAs: { if (it == "${idSample}_*.txt" && 'sentieon' in skipQC) null else if (it == "${idSample}_*.txt") "Reports/${idSample}/Sentieon/${it}" - else null + else "Preprocessing/${idSample}/DedupedSentieon/${it}" } input: - set idPatient, idSample, file(bam), file(bai) from bamForSentieonDedup + set idPatient, idSample, file(bam), file(bai) from bam_sentieon_mapped_merged_indexed file(fasta) from ch_fasta file(fastaFai) from ch_fai output: - set idPatient, idSample, file("${idSample}.deduped.bam"), file("${idSample}.deduped.bam.bai") into bamDedupedSentieon - file("${idSample}_*.txt") into bamDedupedSentieonQC + set idPatient, idSample, file("${idSample}.deduped.bam"), file("${idSample}.deduped.bam.bai") into bam_sentieon_dedup when: params.sentieon @@ -1539,9 +1532,9 @@ bamMergeBamRecal = bamMergeBamRecal.groupTuple(by:[0, 1]) // STEP 4': SENTIEON BQSR -bamDedupedSentieon = bamDedupedSentieon.dump(tag:'deduped.bam') +bam_sentieon_dedup = bam_sentieon_dedup.dump(tag:'deduped.bam') -process SentieonBQSR { +process Sentieon_BQSR { label 'cpus_max' label 'memory_max' label 'sentieon' @@ -1555,7 +1548,7 @@ process SentieonBQSR { } input: - set idPatient, idSample, file(bam), file(bai) from bamDedupedSentieon + set idPatient, idSample, file(bam), file(bai) from bam_sentieon_dedup file(dbsnp) from ch_dbsnp file(dbsnpIndex) from ch_dbsnp_tbi file(fasta) from ch_fasta @@ -1565,9 +1558,9 @@ process SentieonBQSR { file(knownIndelsIndex) from ch_known_indels_tbi output: - set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bamRecalSentieon - set idPatient, idSample into bamRecalSentieonTSV - file("${idSample}_recal_result.csv") into bamRecalSentieonQC + set idPatient, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bam.bai") into bam_sentieon_recal + set idPatient, idSample, file(bam), file(bai), file("${idSample}.recal.table") into bam_sentieon_deduped_table + set idPatient, idSample into tsv_sentieon when: params.sentieon @@ -1602,27 +1595,48 @@ process SentieonBQSR { """ } -(bamRecalSentieonTSV, bamRecalSentieonSampleTSV) = bamRecalSentieonTSV.into(2) +(tsv_sentieon_deduped, tsv_sentieon_deduped_sample, tsv_sentieon_recal, tsv_sentieon_recal_sample) = tsv_sentieon.into(4) // Creating a TSV file to restart from this step -bamRecalSentieonTSV.map { idPatient, idSample -> +tsv_sentieon_deduped.map { idPatient, idSample -> + gender = genderMap[idPatient] + status = statusMap[idPatient, idSample] + bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" + table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" + "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${table}\n" +}.collectFile( + name: 'sentieon_deduped.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" +) + +tsv_sentieon_deduped_sample + .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> + status = statusMap[idPatient, idSample] + gender = genderMap[idPatient] + bam = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam" + bai = "${params.outdir}/Preprocessing/${idSample}/DedupedSentieon/${idSample}.deduped.bam.bai" + table = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.table" + ["sentieon_deduped_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${table}\n"] +} + +// Creating a TSV file to restart from this step +tsv_sentieon_recal.map { idPatient, idSample -> gender = genderMap[idPatient] status = statusMap[idPatient, idSample] bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n" }.collectFile( - name: 'recalibrated_sentieon.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" + name: 'sentieon_recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" ) -bamRecalSentieonSampleTSV - .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { - idPatient, idSample -> +tsv_sentieon_recal_sample + .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { idPatient, idSample -> status = statusMap[idPatient, idSample] gender = genderMap[idPatient] bam = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam" bai = "${params.outdir}/Preprocessing/${idSample}/RecalSentieon/${idSample}.recal.bam.bai" - ["recalibrated_sentieon_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] + ["sentieon_recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\n"] } // STEP 4.5: MERGING THE RECALIBRATED BAM FILES @@ -1773,8 +1787,8 @@ bamQCReport = bamQCReport.dump(tag:'BamQC') ================================================================================ */ -// When using sentieon for mapping, Channel bamRecal is bamRecalSentieon -if (params.sentieon && step == 'mapping') bamRecal = bamRecalSentieon +// When using sentieon for mapping, Channel bamRecal is bam_sentieon_recal +if (params.sentieon && step == 'mapping') bamRecal = bam_sentieon_recal // When no knownIndels for mapping, Channel bamRecal is bam_mapped_merged_indexed bamRecal = (params.known_indels && step == 'mapping') ? bamRecal : bam_mapped_merged_indexed @@ -1789,7 +1803,9 @@ bamRecal = bamRecal.dump(tag:'BAM for Variant Calling') // Manta will be run in Germline mode, or in Tumor mode depending on status // HaplotypeCaller, TIDDIT and Strelka will be run for Normal and Tumor samples -(bamSentieonDNAscope, bamSentieonDNAseq, bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bamRecal.into(8) +(bamMantaSingle, bamStrelkaSingle, bamTIDDIT, bamFreebayesSingleNoIntervals, bamHaplotypeCallerNoIntervals, bamRecalAll) = bamRecal.into(6) + +(bam_sentieon_DNAseq, bam_sentieon_DNAscope, bam_sentieon_all) = bam_sentieon_deduped_table.into(3) // To speed Variant Callers up we are chopping the reference into smaller pieces // Do variant calling by this intervals, and re-merge the VCFs @@ -1880,7 +1896,7 @@ vcfGenotypeGVCFs = vcfGenotypeGVCFs.groupTuple(by:[0, 1, 2]) // STEP SENTIEON DNAseq -process SentieonDNAseq { +process Sentieon_DNAseq { label 'cpus_max' label 'memory_max' label 'sentieon' @@ -1888,14 +1904,14 @@ process SentieonDNAseq { tag {idSample} input: - set idPatient, idSample, file(bam), file(bai) from bamSentieonDNAseq + set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAseq file(dbsnp) from ch_dbsnp file(dbsnpIndex) from ch_dbsnp_tbi file(fasta) from ch_fasta file(fastaFai) from ch_fai output: - set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into sentieonDNAseqVCF + set val("SentieonDNAseq"), idPatient, idSample, file("DNAseq_${idSample}.vcf") into vcf_sentieon_DNAseq when: 'dnaseq' in tools && params.sentieon @@ -1905,17 +1921,18 @@ process SentieonDNAseq { -t ${task.cpus} \ -r ${fasta} \ -i ${bam} \ - --algo Genotyper \ + -q ${recal} \ + --algo Haplotyper \ -d ${dbsnp} \ DNAseq_${idSample}.vcf """ } -sentieonDNAseqVCF = sentieonDNAseqVCF.dump(tag:'sentieon DNAseq') +vcf_sentieon_DNAseq = vcf_sentieon_DNAseq.dump(tag:'sentieon DNAseq') // STEP SENTIEON DNAscope -process SentieonDNAscope { +process Sentieon_DNAscope { label 'cpus_max' label 'memory_max' label 'sentieon' @@ -1923,15 +1940,15 @@ process SentieonDNAscope { tag {idSample} input: - set idPatient, idSample, file(bam), file(bai) from bamSentieonDNAscope + set idPatient, idSample, file(bam), file(bai), file(recal) from bam_sentieon_DNAscope file(dbsnp) from ch_dbsnp file(dbsnpIndex) from ch_dbsnp_tbi file(fasta) from ch_fasta file(fastaFai) from ch_fai output: - set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into sentieonDNAscopeVCF - set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_SV_${idSample}.vcf") into sentieonDNAscopeSVVCF + set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_${idSample}.vcf") into vcf_sentieon_DNAscope + set val("SentieonDNAscope"), idPatient, idSample, file("DNAscope_SV_${idSample}.vcf") into vcf_sentieon_DNAscope_SV when: 'dnascope' in tools && params.sentieon @@ -1941,6 +1958,7 @@ process SentieonDNAscope { -t ${task.cpus} \ -r ${fasta} \ -i ${bam} \ + -q ${recal} \ --algo DNAscope \ -d ${dbsnp} \ DNAscope_${idSample}.vcf @@ -1949,6 +1967,7 @@ process SentieonDNAscope { -t ${task.cpus} \ -r ${fasta}\ -i ${bam} \ + -q ${recal} \ --algo DNAscope \ --var_type bnd \ -d ${dbsnp} \ @@ -1957,14 +1976,15 @@ process SentieonDNAscope { sentieon driver \ -t ${task.cpus} \ -r ${fasta}\ + -q ${recal} \ --algo SVSolver \ -v DNAscope_${idSample}.temp.vcf \ DNAscope_SV_${idSample}.vcf """ } -sentieonDNAscopeVCF = sentieonDNAscopeVCF.dump(tag:'sentieon DNAscope') -sentieonDNAscopeSVVCF = sentieonDNAscopeSVVCF.dump(tag:'sentieon DNAscope SV') +vcf_sentieon_DNAscope = vcf_sentieon_DNAscope.dump(tag:'sentieon DNAscope') +vcf_sentieon_DNAscope_SV = vcf_sentieon_DNAscope_SV.dump(tag:'sentieon DNAscope SV') // STEP STRELKA.1 - SINGLE MODE @@ -2169,7 +2189,24 @@ pairBam = bamNormal.cross(bamTumor).map { pairBam = pairBam.dump(tag:'BAM Somatic Pair') // Manta, Strelka, Mutect2, MSIsensor -(pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBamTNscope, pairBamMsisensor, pairBam) = pairBam.into(8) +(pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBamMsisensor, pairBam) = pairBam.into(7) + +// Making Pair Bam for Sention + +// separate BAM by status +bam_sention_normal = Channel.create() +bam_sentieon_tumor = Channel.create() + +bam_sentieon_all + .choice(bam_sentieon_tumor, bam_sention_normal) {statusMap[it[0], it[1]] == 0 ? 1 : 0} + +// Crossing Normal and Tumor to get a T/N pair for Somatic Variant Calling +// Remapping channel to remove common key idPatient + +bam_pair_sentieon_TNscope = bam_sention_normal.cross(bam_sentieon_tumor).map { + normal, tumor -> + [normal[0], normal[1], normal[2], normal[3], normal[4], tumor[1], tumor[2], tumor[3], tumor[4]] +} intervalPairBam = pairBam.spread(bedIntervals) @@ -2474,7 +2511,7 @@ process FilterMutect2Calls { // STEP SENTIEON TNSCOPE -process SentieonTNscope { +process Sentieon_TNscope { label 'cpus_max' label 'memory_max' label 'sentieon' @@ -2482,7 +2519,7 @@ process SentieonTNscope { tag {idSampleTumor + "_vs_" + idSampleNormal} input: - set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamTNscope + set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), file(recalNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(recalTumor) from bam_pair_sentieon_TNscope file(dict) from ch_dict file(fasta) from ch_fasta file(fastaFai) from ch_fai @@ -2490,7 +2527,7 @@ process SentieonTNscope { file(dbsnpIndex) from ch_dbsnp_tbi output: - set val("SentieonTNscope"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf") into vcfTNscope + set val("SentieonTNscope"), idPatient, val("${idSampleTumor}_vs_${idSampleNormal}"), file("*.vcf") into vcf_sentieon_TNscope when: 'tnscope' in tools && params.sentieon @@ -2500,7 +2537,9 @@ process SentieonTNscope { -t ${task.cpus} \ -r ${fasta} \ -i ${bamTumor} \ + -q ${recalTumor} \ -i ${bamNormal} \ + -q ${recalNormal} \ --algo TNscope \ --tumor_sample ${idSampleTumor} \ --normal_sample ${idSampleNormal} \ @@ -2509,9 +2548,9 @@ process SentieonTNscope { """ } -vcfTNscope = vcfTNscope.dump(tag:'Sentieon TNscope') +vcf_sentieon_TNscope = vcf_sentieon_TNscope.dump(tag:'Sentieon TNscope') -sentieonVCF = sentieonDNAseqVCF.mix(sentieonDNAscopeVCF, sentieonDNAscopeSVVCF, vcfTNscope) +vcf_sentieon = vcf_sentieon_DNAseq.mix(vcf_sentieon_DNAscope, vcf_sentieon_DNAscope_SV, vcf_sentieon_TNscope) process CompressSentieonVCF { tag {"${idSample} - ${vcf}"} @@ -2519,12 +2558,10 @@ process CompressSentieonVCF { publishDir "${params.outdir}/VariantCalling/${idSample}/${variantCaller}", mode: params.publish_dir_mode input: - set variantCaller, idPatient, idSample, file(vcf) from sentieonVCF + set variantCaller, idPatient, idSample, file(vcf) from vcf_sentieon output: - set variantCaller, idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcfSentieon - - when: params.sentieon + set variantCaller, idPatient, idSample, file("*.vcf.gz"), file("*.vcf.gz.tbi") into vcf_sentieon_compressed script: """ @@ -2533,7 +2570,7 @@ process CompressSentieonVCF { """ } -vcfSentieon = vcfSentieon.dump(tag:'Sentieon VCF indexed') +vcf_sentieon_compressed = vcf_sentieon_compressed.dump(tag:'Sentieon VCF indexed') // STEP STRELKA.2 - SOMATIC PAIR @@ -3052,7 +3089,7 @@ vcfKeep = Channel.empty().mix( variantcaller, idPatient, idSample, vcf, tbi -> [variantcaller, idSample, vcf] }, - vcfSentieon.map { + vcf_sentieon_compressed.map { variantcaller, idPatient, idSample, vcf, tbi -> [variantcaller, idSample, vcf] },