From 5e0d034eaf6868ca0161d989140ad127d8a22aa6 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 Dec 2019 15:11:03 +0100 Subject: [PATCH 01/10] add GATK4-spark possibilities --- main.nf | 24 +++++++++++------------- nextflow.config | 1 - 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/main.nf b/main.nf index 6973004aaf..2fb9bb5e1e 100644 --- a/main.nf +++ b/main.nf @@ -822,15 +822,14 @@ process IndexBamFile { // STEP 2: MARKING DUPLICATES -process MarkDuplicates { +process MarkDuplicatesSpark { label 'cpus_16' tag {idPatient + "-" + idSample} publishDir params.outdir, mode: params.publishDirMode, saveAs: { - if (it == "${idSample}.bam.metrics" && 'markduplicates' in skipQC) null - else if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" + if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" else "Preprocessing/${idSample}/DuplicateMarked/${it}" } @@ -838,23 +837,22 @@ process MarkDuplicates { set idPatient, idSample, file("${idSample}.bam") from mergedBam output: - set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bai") into duplicateMarkedBams - file ("${idSample}.bam.metrics") into markDuplicatesReport + set idPatient, idSample, file("${idSample}.md.bam"), file("${idSample}.md.bam.bai") into duplicateMarkedBams + file ("${idSample}.bam.metrics") optional true into markDuplicatesReport when: step == 'mapping' && params.knownIndels script: markdup_java_options = task.memory.toGiga() > 8 ? params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" + metrics = 'markduplicates' in skipQC ? '' : "-M ${idSample}.bam.metrics" """ gatk --java-options ${markdup_java_options} \ - MarkDuplicates \ - --MAX_RECORDS_IN_RAM 50000 \ - --INPUT ${idSample}.bam \ - --METRICS_FILE ${idSample}.bam.metrics \ - --TMP_DIR . 
\ - --ASSUME_SORT_ORDER coordinate \ - --CREATE_INDEX true \ - --OUTPUT ${idSample}.md.bam + MarkDuplicatesSpark \ + -I ${idSample}.bam \ + -O ${idSample}.md.bam \ + ${metrics} \ + --tmp-dir . \ + --create-output-bam-index true """ } diff --git a/nextflow.config b/nextflow.config index 6d4d86ed5f..fde1b6f3b8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -112,7 +112,6 @@ profiles { docker { enabled = true fixOwnership = true - runOptions = "-u \$(id -u):\$(id -g)" } singularity.enabled = false } From 3b29c785f072e70172f2b713b8c654f0cf88737f Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 Dec 2019 15:16:58 +0100 Subject: [PATCH 02/10] update docs and containers --- CHANGELOG.md | 3 +++ docs/containers.md | 3 ++- docs/output.md | 6 +++--- environment.yml | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a52ada6b0f..b5e0a50814 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a - [#60](https://github.com/nf-core/sarek/pull/60) - Add new minimal genomes (`TAIR10`, `EB2`, `UMD3.1`, `bosTau8`, `WBcel235`, `ce10`, `CanFam3.1`, `canFam3`, `GRCz10`, `danRer10`, `BDGP6`, `dm6`, `EquCab2`, `equCab2`, `EB1`, `Galgal4`, `galGal4`, `Gm01`, `hg38`, `hg19`, `Mmul_1`, `mm10`, `IRGSP-1.0`, `CHIMP2.1.4`, `panTro4`, `Rnor_6.0`, `rn6`, `R64-1-1`, `sacCer3`, `EF2`, `Sbi1`, `Sscrofa10.2`, `susScr3`, `AGPv3`) to `igenomes.config` - [#61](https://github.com/nf-core/sarek/pull/61) - Add params `split_fastq` - [#61](https://github.com/nf-core/sarek/pull/61) - Add test `SPLITFASTQ` +- [#66](https://github.com/nf-core/sarek/pull/66) - Add `Sentieon` possibilities to Sarek +- [#76](https://github.com/nf-core/sarek/pull/76) - Add `GATK Spark` possibilities to Sarek ### `Changed` @@ -38,6 +40,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a - [#74](https://github.com/nf-core/sarek/pull/74) - Update docs 
- [#74](https://github.com/nf-core/sarek/pull/74) - Improve CI tests (both Jenkins and GitHub actions tests) - [#74](https://github.com/nf-core/sarek/pull/74) - Move all ci from `ci-extra.yml` to `ci.yml` +- [#76](https://github.com/nf-core/sarek/pull/76) - Use `MarkDuplicatesSpark` instead of `MarkDuplicates` ### `Removed` diff --git a/docs/containers.md b/docs/containers.md index 6dfd8edae2..96d3c0c2af 100644 --- a/docs/containers.md +++ b/docs/containers.md @@ -21,7 +21,8 @@ For annotation, the main container can be used, but the cache has to be download - Contain **[Control-FREEC](https://github.com/BoevaLab/FREEC)** 11.5 - Contain **[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/)** 0.11.8 - Contain **[FreeBayes](https://github.com/ekg/freebayes)** 1.3.1 -- Contain **[GATK4](https://github.com/broadinstitute/gatk)** 4.1.4.0 +- Contain **[GATK4](https://github.com/broadinstitute/gatk)** 4.1.4.1 +- Contain **[GATK4-spark](https://github.com/broadinstitute/gatk)** 4.1.4.1 - Contain **[GeneSplicer](https://ccb.jhu.edu/software/genesplicer/)** 1.0 - Contain **[HTSlib](https://github.com/samtools/htslib)** 1.9 - Contain **[Manta](https://github.com/Illumina/manta)** 1.6.0 diff --git a/docs/output.md b/docs/output.md index 4d4813dbc3..dda76b9072 100644 --- a/docs/output.md +++ b/docs/output.md @@ -10,7 +10,7 @@ The pipeline processes data using the following steps: - [Map to Reference](#map-to-reference) - [BWA mem](#bwa-mem) - [Mark Duplicates](#mark-duplicates) - - [GATK MarkDuplicates](#gatk-markduplicates) + - [GATK MarkDuplicatesSpark](#gatk-markduplicatesspark) - [Base (Quality Score) Recalibration](#base-quality-score-recalibration) - [GATK BaseRecalibrator](#gatk-baserecalibrator) - [GATK ApplyBQSR](#gatk-applybqsr) @@ -66,9 +66,9 @@ Such files are intermediate and not kept in the final files delivered to users. 
### Mark Duplicates -#### GATK MarkDuplicates +#### GATK MarkDuplicatesSpark -[GATK MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.4.0/picard_sam_markduplicates_MarkDuplicates.php) locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. +[GATK MarkDuplicatesSpark](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_transforms_markduplicates_MarkDuplicatesSpark.php) is a Spark implementation of [Picard MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php) and locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. This directory is the location for the BAM files delivered to users. Besides the duplicate marked BAM files, the recalibration tables (`*.recal.table`) are also stored, and can be used to create base recalibrated files. 
diff --git a/environment.yml b/environment.yml index 56e57e8f0a..1350b15716 100644 --- a/environment.yml +++ b/environment.yml @@ -14,7 +14,7 @@ dependencies: - ensembl-vep=98.2 - fastqc=0.11.8 - freebayes=1.3.1 - - gatk4=4.1.4.0 + - gatk4-spark=4.1.4.1 - genesplicer=1.0 - htslib=1.9 - manta=1.6.0 From b17ce672a2d579ecb79011ab885522aaa2428ba1 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 Dec 2019 15:22:01 +0100 Subject: [PATCH 03/10] update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b5e0a50814..48154ff601 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,11 +41,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a - [#74](https://github.com/nf-core/sarek/pull/74) - Improve CI tests (both Jenkins and GitHub actions tests) - [#74](https://github.com/nf-core/sarek/pull/74) - Move all ci from `ci-extra.yml` to `ci.yml` - [#76](https://github.com/nf-core/sarek/pull/76) - Use `MarkDuplicatesSpark` instead of `MarkDuplicates` +- [#76](https://github.com/nf-core/sarek/pull/76) - Use `gatk4-spark` instead of `gatk4` in `environment.yml` ### `Removed` - [#46](https://github.com/nf-core/sarek/pull/46) - Remove mention of old `build.nf` script which was included in `main.nf` - [#74](https://github.com/nf-core/sarek/pull/74) - Remove `download_image.sh` and `run_tests.sh` scripts +- [#76](https://github.com/nf-core/sarek/pull/76) - Remove `runOptions = "-u \$(id -u):\$(id -g)"` in `nextflow.config` to enable `Spark` possibilities ### `Fixed` From 428b51f132a9b8a7d5d8f64e65678c27c0acaf4e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 Dec 2019 15:33:06 +0100 Subject: [PATCH 04/10] fix TSV --- main.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 7acbdf0bde..d9c966f330 100644 --- a/main.nf +++ b/main.nf @@ -1130,7 +1130,7 @@ recalTableTSV.map { idPatient, idSample -> status = statusMap[idPatient, idSample] 
gender = genderMap[idPatient] bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bai" + bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam.bai" recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.recal.table" "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n" }.collectFile( @@ -1143,7 +1143,7 @@ recalTableSampleTSV status = statusMap[idPatient, idSample] gender = genderMap[idPatient] bam = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam" - bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bai" + bai = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.md.bam.bai" recalTable = "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${idSample}.recal.table" ["duplicateMarked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${bam}\t${bai}\t${recalTable}\n"] } @@ -1153,12 +1153,10 @@ bamApplyBQSR = bamMDToJoin.join(recalTable, by:[0,1]) if (step == 'recalibrate') bamApplyBQSR = inputSample bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE') -// [DUMP: recal.table] ['normal', 'normal', normal.md.bam, normal.md.bai, normal.recal.table] bamApplyBQSR = bamApplyBQSR.combine(intApplyBQSR) bamApplyBQSR = bamApplyBQSR.dump(tag:'BAM + BAI + RECAL TABLE + INT') -// [DUMP: BAM + BAI + RECAL TABLE + INT] ['normal', 'normal', normal.md.bam, normal.md.bai, normal.recal.table, 1_1-200000.bed] // STEP 4: RECALIBRATING From e894dc9aee827b585fad4fb6054c276a1c4215f1 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 Dec 2019 16:28:42 +0100 Subject: [PATCH 05/10] fix syntax for IndexFeatureFile --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index d9c966f330..627c224e7d 100644 --- a/main.nf +++ b/main.nf @@ 
-1513,7 +1513,8 @@ process GenotypeGVCFs { // Using -L is important for speed and we have to index the interval files also """ gatk --java-options -Xmx${task.memory.toGiga()}g \ - IndexFeatureFile -F ${gvcf} + IndexFeatureFile \ + -I ${gvcf} gatk --java-options -Xmx${task.memory.toGiga()}g \ GenotypeGVCFs \ From 88aa09f3835f00574048895971af6fd1519d179e Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 10 Dec 2019 17:02:01 +0100 Subject: [PATCH 06/10] typo --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 89ee9f46d5..11d5eec8ad 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -125,6 +125,6 @@ jobs: run: | docker pull nfcore/sarek:dev docker tag nfcore/sarek:dev nfcore/sarek:dev - - name: Run vriant calling test on specific tools + - name: Run variant calling test on specific tools run: | nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }} \ No newline at end of file From 312d7331ada630483737cc5ca6e7c73f06e139b4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 17 Dec 2019 13:44:22 +0100 Subject: [PATCH 07/10] update docs --- docs/output.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/output.md b/docs/output.md index fb00e4f329..fd5a2c80c5 100644 --- a/docs/output.md +++ b/docs/output.md @@ -510,9 +510,13 @@ For more information about how to use Qualimap bamqc reports, see [Qualimap bamq #### MarkDuplicates reports -[GATK MarkDuplicates](https://github.com/broadinstitute/gatk) locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. -Duplicates can arise during sample preparation e.g. -library construction using PCR. 
+[GATK MarkDuplicatesSpark](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_spark_transforms_markduplicates_MarkDuplicatesSpark.php), a Spark implementation of [Picard MarkDuplicates](https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php), +locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. + +Collecting duplicate metrics slows down performance. +To disable them use `--skipQC MarkDuplicates`. + +Duplicates can arise during sample preparation _e.g._ library construction using PCR. Duplicate reads can also result from a single amplification cluster, incorrectly detected as multiple clusters by the optical sensor of the sequencing instrument. These duplication artifacts are referred to as optical duplicates. From 861c8b1d43716c5fa41b8c2c3d201ffab836dae5 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 17 Dec 2019 14:27:53 +0100 Subject: [PATCH 08/10] better ponIndex --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 627c224e7d..2addb5e3cd 100644 --- a/main.nf +++ b/main.nf @@ -557,7 +557,7 @@ process BuildPonIndex { """ } -ch_ponIndex = params.pon_index ? Channel.value(file(params.pon_index)) : ponIndexBuilt +ch_ponIndex = params.pon ? params.pon_index ? Channel.value(file(params.pon_index)) : ponIndexBuilt : "null" process BuildIntervals { tag {fastaFai} @@ -1437,7 +1437,7 @@ bamRecal = (params.knownIndels && step == 'mapping') ? 
bamRecal : indexedBam // When starting with variant calling, Channel bamRecal is inputSample if (step == 'variantcalling') bamRecal = inputSample -bamRecal = bamRecal.dump(tag:'BAM') +bamRecal = bamRecal.dump(tag:'BAM for Variant Calling') // Here we have a recalibrated bam set // The TSV file is formatted like: "idPatient status idSample bamFile baiFile" From 530f6580aa4620da74e2b75ed0b8dc1f0b68f29a Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 17 Dec 2019 14:38:53 +0100 Subject: [PATCH 09/10] trying to have nicer tests --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c178671d56..704f3fd48e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -105,7 +105,7 @@ jobs: run: | docker pull nfcore/sarek:dev docker tag nfcore/sarek:dev nfcore/sarek:dev - - name: Run targeted and splitfastq tests + - name: Run ${{ matrix.profile }} test run: | nextflow run . -profile ${{ matrix.profile }},docker --verbose tools: @@ -125,6 +125,6 @@ jobs: run: | docker pull nfcore/sarek:dev docker tag nfcore/sarek:dev nfcore/sarek:dev - - name: Run variant calling test on specific tools + - name: Run variant calling test on ${{ matrix.tool }} run: | nextflow run . 
-profile test_tool,docker --verbose --tools ${{ matrix.tool }} From a93e10b58787deebe109e3ce9d25d911e0a241ba Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Tue, 17 Dec 2019 14:41:45 +0100 Subject: [PATCH 10/10] update CHANGELOG --- .github/workflows/ci.yml | 2 +- CHANGELOG.md | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 704f3fd48e..25fcef4914 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -125,6 +125,6 @@ jobs: run: | docker pull nfcore/sarek:dev docker tag nfcore/sarek:dev nfcore/sarek:dev - - name: Run variant calling test on ${{ matrix.tool }} + - name: Run ${{ matrix.tool }} test run: | nextflow run . -profile test_tool,docker --verbose --tools ${{ matrix.tool }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 73479d48b5..ca00e11eed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a ## dev +### `Added` + +- [#76](https://github.com/nf-core/sarek/pull/76) - Add `GATK Spark` possibilities to Sarek + ### `Changed` - [#76](https://github.com/nf-core/sarek/pull/76) - Use `MarkDuplicatesSpark` instead of `MarkDuplicates` @@ -30,7 +34,6 @@ Jåkkåtjkaskajekna is one of the two glaciers of the Ålkatj Massif. - [#61](https://github.com/nf-core/sarek/pull/61) - Add params `split_fastq` - [#61](https://github.com/nf-core/sarek/pull/61) - Add test `SPLITFASTQ` - [#66](https://github.com/nf-core/sarek/pull/66) - Add `Sentieon` possibilities to Sarek -- [#76](https://github.com/nf-core/sarek/pull/76) - Add `GATK Spark` possibilities to Sarek ### `Changed`