From 5e0d034eaf6868ca0161d989140ad127d8a22aa6 Mon Sep 17 00:00:00 2001
From: MaxUlysse <max.u.garcia@gmail.com>
Date: Tue, 10 Dec 2019 15:11:03 +0100
Subject: [PATCH 01/10] add GATK4-spark possibilities

---
 main.nf         | 24 +++++++++++-------------
 nextflow.config |  1 -
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/main.nf b/main.nf
index 6973004aaf..2fb9bb5e1e 100644
--- a/main.nf
+++ b/main.nf
@@ -822,15 +822,14 @@ process IndexBamFile {
 
 // STEP 2: MARKING DUPLICATES
 
-process MarkDuplicates {
+process MarkDuplicatesSpark {
     label 'cpus_16'
 
     tag {idPatient + "-" + idSample}
 
     publishDir params.outdir, mode: params.publishDirMode,
         saveAs: {
-            if (it == "${idSample}.bam.metrics" && 'markduplicates' in skipQC) null
-            else if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}"
+            if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}"
             else "Preprocessing/${idSample}/DuplicateMarked/${it}"
         }
 
@@ -838,23 +837,22 @@ process MarkDuplicates {
         set idPatient, idSample, file("${idSample}.bam") from mergedBam
 
     output:
-        set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bai") into duplicateMarkedBams
-        file ("${idSample}.bam.metrics") into markDuplicatesReport
+        set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into duplicateMarkedBams
+        file ("${idSample}.bam.metrics") optional true into markDuplicatesReport
 
     when: step == 'mapping' && params.knownIndels
 
     script:
     markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" +  (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\""
+    metrics = 'markduplicates' in skipQC ? '' : "-M ${idSample}.bam.metrics"
     """
     gatk --java-options ${markdup_java_options} \
-        MarkDuplicates \
-        --MAX_RECORDS_IN_RAM 50000 \
-        --INPUT ${idSample}.bam \
-        --METRICS_FILE ${idSample}.bam.metrics \
-        --TMP_DIR . \
-        --ASSUME_SORT_ORDER coordinate \
-        --CREATE_INDEX true \
-        --OUTPUT ${idSample}.md.bam
+        MarkDuplicatesSpark \
+        -I ${idSample}.bam \
+        -O ${idSample}.md.bam \
+        ${metrics} \
+        --tmp-dir . \
+        --create-output-bam-index true
     """
 }
 
diff --git a/nextflow.config b/nextflow.config
index 6d4d86ed5f..fde1b6f3b8 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -112,7 +112,6 @@ profiles {
     docker {
       enabled = true
       fixOwnership = true
-      runOptions = "-u \$(id -u):\$(id -g)"
     }
     singularity.enabled = false
   }

From 3b29c785f072e70172f2b713b8c654f0cf88737f Mon Sep 17 00:00:00 2001
From: MaxUlysse <max.u.garcia@gmail.com>
Date: Tue, 10 Dec 2019 15:16:58 +0100
Subject: [PATCH 02/10] update docs and containers

---
 CHANGELOG.md       | 3 +++
 docs/containers.md | 3 ++-
 docs/output.md     | 6 +++---
 environment.yml    | 2 +-
 4 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a52ada6b0f..b5e0a50814 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 - [#60](https://github.com/nf-core/sarek/pull/60) - Add new minimal genomes (`TAIR10`, `EB2`, `UMD3.1`, `bosTau8`, `WBcel235`, `ce10`, `CanFam3.1`, `canFam3`, `GRCz10`, `danRer10`, `BDGP6`, `dm6`, `EquCab2`, `equCab2`, `EB1`, `Galgal4`, `galGal4`, `Gm01`, `hg38`, `hg19`, `Mmul_1`, `mm10`, `IRGSP-1.0`, `CHIMP2.1.4`, `panTro4`, `Rnor_6.0`, `rn6`, `R64-1-1`, `sacCer3`, `EF2`, `Sbi1`, `Sscrofa10.2`, `susScr3`, `AGPv3`) to `igenomes.config`
 - [#61](https://github.com/nf-core/sarek/pull/61) - Add params `split_fastq`
 - [#61](https://github.com/nf-core/sarek/pull/61) - Add test `SPLITFASTQ`
+- [#66](https://github.com/nf-core/sarek/pull/66) - Add `Sentieon` possibilities to Sarek
+- [#76](https://github.com/nf-core/sarek/pull/76) - Add `GATK Spark` possibilities to Sarek
 
 ### `Changed`
 
@@ -38,6 +40,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 - [#74](https://github.com/nf-core/sarek/pull/74) - Update docs
 - [#74](https://github.com/nf-core/sarek/pull/74) - Improve CI tests (both Jenkins and GitHub actions tests)
 - [#74](https://github.com/nf-core/sarek/pull/74) - Move all ci from `ci-extra.yml` to `ci.yml`
+- [#76](https://github.com/nf-core/sarek/pull/76) - Use `MarkDuplicatesSpark` instead of `MarkDuplicates`
 
 ### `Removed`
 
diff --git a/docs/containers.md b/docs/containers.md
index 6dfd8edae2..96d3c0c2af 100644
--- a/docs/containers.md
+++ b/docs/containers.md
@@ -21,7 +21,8 @@ For annotation, the main container can be used, but the cache has to be download
 - Contain **[Control-FREEC](https://github.com/BoevaLab/FREEC)** 11.5
 - Contain **[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/)** 0.11.8
 - Contain **[FreeBayes](https://github.com/ekg/freebayes)** 1.3.1
-- Contain **[GATK4](https://github.com/broadinstitute/gatk)** 4.1.4.0
+- Contain **[GATK4](https://github.com/broadinstitute/gatk)** 4.1.4.1
+- Contain **[GATK4-spark](https://github.com/broadinstitute/gatk)** 4.1.4.1
 - Contain **[GeneSplicer](https://ccb.jhu.edu/software/genesplicer/)** 1.0
 - Contain **[HTSlib](https://github.com/samtools/htslib)** 1.9
 - Contain **[Manta](https://github.com/Illumina/manta)** 1.6.0
diff --git a/docs/output.md b/docs/output.md
index 4d4813dbc3..dda76b9072 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -10,7 +10,7 @@ The pipeline processes data using the following steps:
   - [Map to Reference](#map-to-reference)
     - [BWA mem](#bwa-mem)
   - [Mark Duplicates](#mark-duplicates)
-    - [GATK MarkDuplicates](#gatk-markduplicates)
+    - [GATK MarkDuplicatesSpark](#gatk-markduplicatesspark)
   - [Base (Quality Score) Recalibration](#base-quality-score-recalibration)
     - [GATK BaseRecalibrator](#gatk-baserecalibrator)
     - [GATK ApplyBQSR](#gatk-applybqsr)
@@ -66,9 +66,9 @@ Such files are intermediate and not kept in the final files delivered to users.
 
 ### Mark Duplicates
 
-#### GATK MarkDuplicates
+#### GATK MarkDuplicatesSpark
 
-[GATK MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.4.0/picard_sam_markduplicates_MarkDuplicates.php) locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
+[GATK MarkDuplicatesSpark](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_transforms_markduplicates_MarkDuplicatesSpark.php) is a Spark implementation of [Picard MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php) and locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
 
 This directory is the location for the BAM files delivered to users.
 Besides the duplicate marked BAM files, the recalibration tables (`*.recal.table`) are also stored, and can be used to create base recalibrated files.
diff --git a/environment.yml b/environment.yml
index 56e57e8f0a..1350b15716 100644
--- a/environment.yml
+++ b/environment.yml
@@ -14,7 +14,7 @@ dependencies:
   - ensembl-vep=98.2
   - fastqc=0.11.8
   - freebayes=1.3.1
-  - gatk4=4.1.4.0
+  - gatk4-spark=4.1.4.1
   - genesplicer=1.0
   - htslib=1.9
   - manta=1.6.0

From b17ce672a2d579ecb79011ab885522aaa2428ba1 Mon Sep 17 00:00:00 2001
From: MaxUlysse <max.u.garcia@gmail.com>
Date: Tue, 10 Dec 2019 15:22:01 +0100
Subject: [PATCH 03/10] update CHANGELOG

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b5e0a50814..48154ff601 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -41,11 +41,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 - [#74](https://github.com/nf-core/sarek/pull/74) - Improve CI tests (both Jenkins and GitHub actions tests)
 - [#74](https://github.com/nf-core/sarek/pull/74) - Move all ci from `ci-extra.yml` to `ci.yml`
 - [#76](https://github.com/nf-core/sarek/pull/76) - Use `MarkDuplicatesSpark` instead of `MarkDuplicates`
+- [#76](https://github.com/nf-core/sarek/pull/76) - Use `gatk4-spark` instead of `gatk4` in `environment.yml`
 
 ### `Removed`
 
 - [#46](https://github.com/nf-core/sarek/pull/46) - Remove mention of old `build.nf` script which was included in `main.nf`
 - [#74](https://github.com/nf-core/sarek/pull/74) - Remove `download_image.sh` and `run_tests.sh` scripts
+- [#76](https://github.com/nf-core/sarek/pull/76) - Remove `runOptions = "-u \$(id -u):\$(id -g)"` in `nextflow.config` to enable `Spark` possibilities
 
 ### `Fixed`
 

From 428b51f132a9b8a7d5d8f64e65678c27c0acaf4e Mon Sep 17 00:00:00 2001
From: MaxUlysse <max.u.garcia@gmail.com>
Date: Tue, 10 Dec 2019 15:33:06 +0100
Subject: [PATCH 04/10] fix TSV

---
 main.nf | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/main.nf b/main.nf
index 7acbdf0bde..d9c966f330 100644
--- a/main.nf
+++ b/main.nf
@@ -1130,7 +1130,7 @@ recalTableTSV.map { idPatient, idSample ->
     status = statusMap[idPatient, idSample]
     gender = genderMap[idPatient]
     bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam"
-    bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bai"
+    bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam.bai"
     recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.recal.table"
     "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"
 }.collectFile(
@@ -1143,7 +1143,7 @@ recalTableSampleTSV
         status = statusMap[idPatient, idSample]
         gender = genderMap[idPatient]
         bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam"
-        bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bai"
+        bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam.bai"
         recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.recal.table"
         ["duplicateMarked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"]
 }
@@ -1153,12 +1153,10 @@ bamApplyBQSR = bamMDToJoin.join(recalTable, by:[0,1])
 if (step == 'recalibrate') bamApplyBQSR = inputSample
 
 bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE')
-// [DUMP: recal.table] ['normal', 'normal', normal.md.bam, normal.md.bai, normal.recal.table]
 
 bamApplyBQSR = bamApplyBQSR.combine(intApplyBQSR)
 
 bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE + INT')
-// [DUMP: BAM + BAI + RECAL TABLE + INT] ['normal', 'normal', normal.md.bam, normal.md.bai, normal.recal.table, 1_1-200000.bed]
 
 // STEP 4: RECALIBRATING
 

From e894dc9aee827b585fad4fb6054c276a1c4215f1 Mon Sep 17 00:00:00 2001
From: MaxUlysse <max.u.garcia@gmail.com>
Date: Tue, 10 Dec 2019 16:28:42 +0100
Subject: [PATCH 05/10] fix syntax for IndexFeatureFile

---
 main.nf | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index d9c966f330..627c224e7d 100644
--- a/main.nf
+++ b/main.nf
@@ -1513,7 +1513,8 @@ process GenotypeGVCFs {
     // Using -L is important for speed and we have to index the interval files also
     """
     gatk --java-options -Xmx${task.memory.toGiga()}g \
-        IndexFeatureFile -F ${gvcf}
+        IndexFeatureFile \
+        -I ${gvcf}
 
     gatk --java-options -Xmx${task.memory.toGiga()}g \
         GenotypeGVCFs \

From 88aa09f3835f00574048895971af6fd1519d179e Mon Sep 17 00:00:00 2001
From: MaxUlysse <max.u.garcia@gmail.com>
Date: Tue, 10 Dec 2019 17:02:01 +0100
Subject: [PATCH 06/10] typo

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 89ee9f46d5..11d5eec8ad 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -125,6 +125,6 @@ jobs:
         run: |
           docker pull nfcore/sarek:dev
           docker tag nfcore/sarek:dev nfcore/sarek:dev
-      - name: Run vriant calling test on specific tools
+      - name: Run variant calling test on specific tools
         run: |
           nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }}
\ No newline at end of file

From 312d7331ada630483737cc5ca6e7c73f06e139b4 Mon Sep 17 00:00:00 2001
From: MaxUlysse <max.u.garcia@gmail.com>
Date: Tue, 17 Dec 2019 13:44:22 +0100
Subject: [PATCH 07/10] update docs

---
 docs/output.md | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/output.md b/docs/output.md
index fb00e4f329..fd5a2c80c5 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -510,9 +510,13 @@ For more information about how to use Qualimap bamqc reports, see [Qualimap bamq
 
 #### MarkDuplicates reports
 
-[GATK MarkDuplicates](https://github.com/broadinstitute/gatk) locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
-Duplicates can arise during sample preparation e.g.
-library construction using PCR.
+[GATK MarkDuplicatesSpark](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_transforms_markduplicates_MarkDuplicatesSpark.php), a Spark implementation of [Picard MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php),
+locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA.
+
+Collecting duplicate metrics slows down performance.
+To skip collecting these metrics, use `--skipQC MarkDuplicates`.
+
+Duplicates can arise during sample preparation _e.g._ library construction using PCR.
 Duplicate reads can also result from a single amplification cluster, incorrectly detected as multiple clusters by the optical sensor of the sequencing instrument.
 These duplication artifacts are referred to as optical duplicates.
 

From 861c8b1d43716c5fa41b8c2c3d201ffab836dae5 Mon Sep 17 00:00:00 2001
From: MaxUlysse <max.u.garcia@gmail.com>
Date: Tue, 17 Dec 2019 14:27:53 +0100
Subject: [PATCH 08/10] better ponIndex

---
 main.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 627c224e7d..2addb5e3cd 100644
--- a/main.nf
+++ b/main.nf
@@ -557,7 +557,7 @@ process BuildPonIndex {
     """
 }
 
-ch_ponIndex = params.pon_index ? Channel.value(file(params.pon_index)) : ponIndexBuilt
+ch_ponIndex = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : ponIndexBuilt : "null"
 
 process BuildIntervals {
   tag {fastaFai}
@@ -1437,7 +1437,7 @@ bamRecal = (params.knownIndels && step == 'mapping') ? bamRecal : indexedBam
 // When starting with variant calling, Channel bamRecal is inputSample
 if (step == 'variantcalling') bamRecal = inputSample
 
-bamRecal = bamRecal.dump(tag:'BAM')
+bamRecal = bamRecal.dump(tag:'BAM for Variant Calling')
 
 // Here we have a recalibrated bam set
 // The TSV file is formatted like: "idPatient status idSample bamFile baiFile"

From 530f6580aa4620da74e2b75ed0b8dc1f0b68f29a Mon Sep 17 00:00:00 2001
From: MaxUlysse <max.u.garcia@gmail.com>
Date: Tue, 17 Dec 2019 14:38:53 +0100
Subject: [PATCH 09/10] trying to have nicer tests

---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c178671d56..704f3fd48e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -105,7 +105,7 @@ jobs:
         run: |
           docker pull nfcore/sarek:dev
           docker tag nfcore/sarek:dev nfcore/sarek:dev
-      - name: Run targeted and splitfastq tests
+      - name: Run ${{ matrix.profile }} test
         run: |
           nextflow run . -profile ${{ matrix.profile }},docker --verbose
   tools:
@@ -125,6 +125,6 @@ jobs:
         run: |
           docker pull nfcore/sarek:dev
           docker tag nfcore/sarek:dev nfcore/sarek:dev
-      - name: Run variant calling test on specific tools
+      - name: Run variant calling test on ${{ matrix.tool }}
         run: |
           nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }}

From a93e10b58787deebe109e3ce9d25d911e0a241ba Mon Sep 17 00:00:00 2001
From: MaxUlysse <max.u.garcia@gmail.com>
Date: Tue, 17 Dec 2019 14:41:45 +0100
Subject: [PATCH 10/10] update CHANGELOG

---
 .github/workflows/ci.yml | 2 +-
 CHANGELOG.md             | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 704f3fd48e..25fcef4914 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -125,6 +125,6 @@ jobs:
         run: |
           docker pull nfcore/sarek:dev
           docker tag nfcore/sarek:dev nfcore/sarek:dev
-      - name: Run variant calling test on ${{ matrix.tool }}
+      - name: Run ${{ matrix.tool }} test
         run: |
           nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 73479d48b5..ca00e11eed 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 
 ## dev
 
+### `Added`
+
+- [#76](https://github.com/nf-core/sarek/pull/76) - Add `GATK Spark` possibilities to Sarek
+
 ### `Changed`
 
 - [#76](https://github.com/nf-core/sarek/pull/76) - Use `MarkDuplicatesSpark` instead of `MarkDuplicates`
@@ -30,7 +34,6 @@ Jåkkåtjkaskajekna is one of the two glaciers of the Ålkatj Massif.
 - [#61](https://github.com/nf-core/sarek/pull/61) - Add params `split_fastq`
 - [#61](https://github.com/nf-core/sarek/pull/61) - Add test `SPLITFASTQ`
 - [#66](https://github.com/nf-core/sarek/pull/66) - Add `Sentieon` possibilities to Sarek
-- [#76](https://github.com/nf-core/sarek/pull/76) - Add `GATK Spark` possibilities to Sarek
 
 ### `Changed`