diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c178671d56..25fcef4914 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -105,7 +105,7 @@ jobs: run: | docker pull nfcore/sarek:dev docker tag nfcore/sarek:dev nfcore/sarek:dev - - name: Run targeted and splitfastq tests + - name: Run ${{ matrix.profile }} test run: | nextflow run . -profile ${{ matrix.profile }},docker --verbose tools: @@ -125,6 +125,6 @@ jobs: run: | docker pull nfcore/sarek:dev docker tag nfcore/sarek:dev nfcore/sarek:dev - - name: Run variant calling test on specific tools + - name: Run ${{ matrix.tool }} test run: | nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 3cdb1cc196..ca00e11eed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a ## dev +### `Added` + +- [#76](https://github.com/nf-core/sarek/pull/76) - Add `GATK Spark` possibilities to Sarek + +### `Changed` + +- [#76](https://github.com/nf-core/sarek/pull/76) - Use `MarkDuplicatesSpark` instead of `MarkDuplicates` +- [#76](https://github.com/nf-core/sarek/pull/76) - Use `gatk4-spark` instead of `gatk4` in `environment.yml` - [#80](https://github.com/nf-core/sarek/pull/80) - Re-bump `dev` branch ## [2.5.2] - Jåkkåtjkaskajekna @@ -25,6 +33,7 @@ Jåkkåtjkaskajekna is one of the two glaciers of the Ålkatj Massif. 
- [#60](https://github.com/nf-core/sarek/pull/60) - Add new minimal genomes (`TAIR10`, `EB2`, `UMD3.1`, `bosTau8`, `WBcel235`, `ce10`, `CanFam3.1`, `canFam3`, `GRCz10`, `danRer10`, `BDGP6`, `dm6`, `EquCab2`, `equCab2`, `EB1`, `Galgal4`, `galGal4`, `Gm01`, `hg38`, `hg19`, `Mmul_1`, `mm10`, `IRGSP-1.0`, `CHIMP2.1.4`, `panTro4`, `Rnor_6.0`, `rn6`, `R64-1-1`, `sacCer3`, `EF2`, `Sbi1`, `Sscrofa10.2`, `susScr3`, `AGPv3`) to `igenomes.config` - [#61](https://github.com/nf-core/sarek/pull/61) - Add params `split_fastq` - [#61](https://github.com/nf-core/sarek/pull/61) - Add test `SPLITFASTQ` +- [#66](https://github.com/nf-core/sarek/pull/66) - Add `Sentieon` possibilities to Sarek ### `Changed` @@ -41,6 +50,7 @@ Jåkkåtjkaskajekna is one of the two glaciers of the Ålkatj Massif. - [#46](https://github.com/nf-core/sarek/pull/46) - Remove mention of old `build.nf` script which was included in `main.nf` - [#74](https://github.com/nf-core/sarek/pull/74) - Remove `download_image.sh` and `run_tests.sh` scripts +- [#76](https://github.com/nf-core/sarek/pull/76) - Remove `runOptions = "-u \$(id -u):\$(id -g)"` in `nextflow.config` to enable `Spark` possibilities ### `Fixed` diff --git a/docs/containers.md b/docs/containers.md index 16e6291612..ddf6de64e8 100644 --- a/docs/containers.md +++ b/docs/containers.md @@ -21,7 +21,7 @@ For annotation, the main container can be used, but the cache has to be download - Contain **[Control-FREEC](https://github.com/BoevaLab/FREEC)** 11.4 - Contain **[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/)** 0.11.8 - Contain **[FreeBayes](https://github.com/ekg/freebayes)** 1.2.0 -- Contain **[GATK4](https://github.com/broadinstitute/gatk)** 4.1.2.0 +- Contain **[GATK4-spark](https://github.com/broadinstitute/gatk)** 4.1.4.1 - Contain **[GeneSplicer](https://ccb.jhu.edu/software/genesplicer/)** 1.0 - Contain **[HTSlib](https://github.com/samtools/htslib)** 1.9 - Contain **[Manta](https://github.com/Illumina/manta)** 1.5.0 diff --git 
a/docs/output.md b/docs/output.md index 4908f032ad..fd5a2c80c5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,7 +10,7 @@ The pipeline processes data using the following steps: - [Map to Reference](#map-to-reference) - [BWA mem](#bwa-mem) - [Mark Duplicates](#mark-duplicates) - - [GATK MarkDuplicates](#gatk-markduplicates) + - [GATK MarkDuplicatesSpark](#gatk-markduplicatesspark) - [Base (Quality Score) Recalibration](#base-quality-score-recalibration) - [GATK BaseRecalibrator](#gatk-baserecalibrator) - [GATK ApplyBQSR](#gatk-applybqsr) @@ -66,9 +66,9 @@ Such files are intermediate and not kept in the final files delivered to users. ### Mark Duplicates -#### GATK MarkDuplicates +#### GATK MarkDuplicatesSpark -[GATK MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.4.0/picard_sam_markduplicates_MarkDuplicates.php) locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. +[GATK MarkDuplicatesSpark](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_transforms_markduplicates_MarkDuplicatesSpark.php) is a Spark implementation of [Picard MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php) and locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. This directory is the location for the BAM files delivered to users. Besides the duplicate marked BAM files, the recalibration tables (`*.recal.table`) are also stored, and can be used to create base recalibrated files. 
@@ -510,9 +510,13 @@ For more information about how to use Qualimap bamqc reports, see [Qualimap bamq #### MarkDuplicates reports -[GATK MarkDuplicates](https://github.com/broadinstitute/gatk) locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. -Duplicates can arise during sample preparation e.g. -library construction using PCR. +[GATK MarkDuplicatesSpark](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_transforms_markduplicates_MarkDuplicatesSpark.php), a Spark implementation of [Picard MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php), +locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. + +Collecting duplicate metrics slows down performance. +To disable them use `--skipQC MarkDuplicates`. + +Duplicates can arise during sample preparation _e.g._ library construction using PCR. Duplicate reads can also result from a single amplification cluster, incorrectly detected as multiple clusters by the optical sensor of the sequencing instrument. These duplication artifacts are referred to as optical duplicates. diff --git a/environment.yml b/environment.yml index 7b7b1bf897..2eebdb797c 100644 --- a/environment.yml +++ b/environment.yml @@ -14,7 +14,7 @@ dependencies: - ensembl-vep=95.2 - fastqc=0.11.8 - freebayes=1.2.0 - - gatk4=4.1.2.0 + - gatk4-spark=4.1.4.1 - genesplicer=1.0 - htslib=1.9 - manta=1.5.0 diff --git a/main.nf b/main.nf index 2ef926a2cb..2addb5e3cd 100644 --- a/main.nf +++ b/main.nf @@ -557,7 +557,7 @@ process BuildPonIndex { """ } -ch_ponIndex = params.pon_index ? Channel.value(file(params.pon_index)) : ponIndexBuilt +ch_ponIndex = params.pon ? params.pon_index ?
Channel.value(file(params.pon_index)) : ponIndexBuilt : "null" process BuildIntervals { tag {fastaFai} @@ -940,15 +940,14 @@ process IndexBamFile { // STEP 2: MARKING DUPLICATES -process MarkDuplicates { +process MarkDuplicatesSpark { label 'cpus_16' tag {idPatient + "-" + idSample} publishDir params.outdir, mode: params.publishDirMode, saveAs: { - if (it == "${idSample}.bam.metrics" && 'markduplicates' in skipQC) null - else if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" + if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" else "Preprocessing/${idSample}/DuplicateMarked/${it}" } @@ -956,23 +955,22 @@ process MarkDuplicates { set idPatient, idSample, file("${idSample}.bam") from mergedBam output: - set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bai") into duplicateMarkedBams - file ("${idSample}.bam.metrics") into markDuplicatesReport + set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into duplicateMarkedBams + file ("${idSample}.bam.metrics") optional true into markDuplicatesReport when: params.knownIndels script: markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" + metrics = 'markduplicates' in skipQC ? '' : "-M ${idSample}.bam.metrics" """ gatk --java-options ${markdup_java_options} \ - MarkDuplicates \ - --MAX_RECORDS_IN_RAM 50000 \ - --INPUT ${idSample}.bam \ - --METRICS_FILE ${idSample}.bam.metrics \ - --TMP_DIR . \ - --ASSUME_SORT_ORDER coordinate \ - --CREATE_INDEX true \ - --OUTPUT ${idSample}.md.bam + MarkDuplicatesSpark \ + -I ${idSample}.bam \ + -O ${idSample}.md.bam \ + ${metrics} \ + --tmp-dir . 
\ + --create-output-bam-index true """ } @@ -1132,7 +1130,7 @@ recalTableTSV.map { idPatient, idSample -> status = statusMap[idPatient, idSample] gender = genderMap[idPatient] bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bai" + bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam.bai" recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.recal.table" "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" }.collectFile( @@ -1145,7 +1143,7 @@ recalTableSampleTSV status = statusMap[idPatient, idSample] gender = genderMap[idPatient] bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bai" + bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam.bai" recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.recal.table" ["duplicateMarked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] } @@ -1155,12 +1153,10 @@ bamApplyBQSR = bamMDToJoin.join(recalTable, by:[0,1]) if (step == 'recalibrate') bamApplyBQSR = inputSample bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE') -// [DUMP: recal.table] ['normal', 'normal', normal.md.bam, normal.md.bai, normal.recal.table] bamApplyBQSR = bamApplyBQSR.combine(intApplyBQSR) bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE + INT') -// [DUMP: BAM + BAI + RECAL TABLE + INT] ['normal', 'normal', normal.md.bam, normal.md.bai, normal.recal.table, 1_1-200000.bed] // STEP 4: RECALIBRATING @@ -1441,7 +1437,7 @@ bamRecal = (params.knownIndels && step == 'mapping') ? 
bamRecal : indexedBam // When starting with variant calling, Channel bamRecal is inputSample if (step == 'variantcalling') bamRecal = inputSample -bamRecal = bamRecal.dump(tag:'BAM') +bamRecal = bamRecal.dump(tag:'BAM for Variant Calling') // Here we have a recalibrated bam set // The TSV file is formatted like: "idPatient status idSample bamFile baiFile" @@ -1517,7 +1513,8 @@ process GenotypeGVCFs { // Using -L is important for speed and we have to index the interval files also """ gatk --java-options -Xmx${task.memory.toGiga()}g \ - IndexFeatureFile -F ${gvcf} + IndexFeatureFile \ + -I ${gvcf} gatk --java-options -Xmx${task.memory.toGiga()}g \ GenotypeGVCFs \ diff --git a/nextflow.config b/nextflow.config index 674bad8460..c82f5fab40 100644 --- a/nextflow.config +++ b/nextflow.config @@ -117,7 +117,6 @@ profiles { docker { enabled = true fixOwnership = true - runOptions = "-u \$(id -u):\$(id -g)" } singularity.enabled = false }