This repository has been archived by the owner on May 13, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 9
/
mitochondria-pipeline.wdl
363 lines (326 loc) · 11.7 KB
/
mitochondria-pipeline.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
## Copyright Broad Institute, 2019
##
## Purpose :
## Workflow for SNP and INDEL variant calling on mitochondria.
##
## Requirements/expectations :
## - BAM or CRAM file
## - Median of the coverage over the autosome (if available, another central statistic would work too)
##
## Output :
## - A VCF file and its index
## - Addtional Metrics
## - Optional realigned to chrM BAM
##
## Software version notes :
## - GATK 4.1
## - Cromwell version support
## - Successfully tested on v37
## - Does not work on versions < v23 due to output syntax
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
## licensing information pertaining to the included programs.
version 1.0
import "https://raw.githubusercontent.com/gatk-workflows/gatk4-mitochondria-pipeline/1.1.0/tasks/align-and-call.wdl" as AlignAndCall
workflow MitochondriaPipeline {
meta {
description: "Takes in fully aligned hg38 bam and outputs VCF of SNP/Indel calls on the mitochondria."
}
input {
File wgs_aligned_input_bam_or_cram
File wgs_aligned_input_bam_or_cram_index
String contig_name = "chrM"
Float? autosomal_coverage
# Read length used for optimization only. If this is too small CollectWgsMetrics might fail, but the results are not
# affected by this number. Default is 151.
Int? max_read_length
# Full reference is only requred if starting with a CRAM (BAM doesn't need these files)
File? ref_fasta
File? ref_fasta_index
File? ref_dict
File mt_dict
File mt_fasta
File mt_fasta_index
File mt_amb
File mt_ann
File mt_bwt
File mt_pac
File mt_sa
File blacklisted_sites
File blacklisted_sites_index
#Shifted reference is used for calling the control region (edge of mitochondria reference).
#This solves the problem that BWA doesn't support alignment to circular contigs.
File mt_shifted_dict
File mt_shifted_fasta
File mt_shifted_fasta_index
File mt_shifted_amb
File mt_shifted_ann
File mt_shifted_bwt
File mt_shifted_pac
File mt_shifted_sa
File blacklisted_sites_shifted
File blacklisted_sites_shifted_index
File shift_back_chain
File control_region_shifted_reference_interval_list
File non_control_region_interval_list
File? gatk_override
String? m2_extra_args
String? m2_filter_extra_args
Float? vaf_filter_threshold
Float? f_score_beta
Boolean compress_output_vcf = false
#Optional runtime arguments
Int? preemptible_tries
}
parameter_meta {
wgs_aligned_input_bam_or_cram: "Full WGS hg38 bam or cram"
autosomal_coverage: "Median coverage of full input bam"
out_vcf: "Final VCF of mitochondrial SNPs and INDELs"
vaf_filter_threshold: "Hard threshold for filtering low VAF sites"
f_score_beta: "F-Score beta balances the filtering strategy between recall and precision. The relative weight of recall to precision."
contig_name: "Name of mitochondria contig in reference that wgs_aligned_input_bam_or_cram is aligned to"
}
call SubsetBamToChrM {
input:
input_bam = wgs_aligned_input_bam_or_cram,
input_bai = wgs_aligned_input_bam_or_cram_index,
contig_name = contig_name,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
ref_dict = ref_dict,
gatk_override = gatk_override,
preemptible_tries = preemptible_tries
}
call RevertSam {
input:
input_bam = SubsetBamToChrM.output_bam,
preemptible_tries = preemptible_tries
}
call AlignAndCall.AlignAndCall as AlignAndCall {
input:
unmapped_bam = RevertSam.unmapped_bam,
autosomal_coverage = autosomal_coverage,
mt_dict = mt_dict,
mt_fasta = mt_fasta,
mt_fasta_index = mt_fasta_index,
mt_amb = mt_amb,
mt_ann = mt_ann,
mt_bwt = mt_bwt,
mt_pac = mt_pac,
mt_sa = mt_sa,
blacklisted_sites = blacklisted_sites,
blacklisted_sites_index = blacklisted_sites_index,
mt_shifted_dict = mt_shifted_dict,
mt_shifted_fasta = mt_shifted_fasta,
mt_shifted_fasta_index = mt_shifted_fasta_index,
mt_shifted_amb = mt_shifted_amb,
mt_shifted_ann = mt_shifted_ann,
mt_shifted_bwt = mt_shifted_bwt,
mt_shifted_pac = mt_shifted_pac,
mt_shifted_sa = mt_shifted_sa,
blacklisted_sites_shifted = blacklisted_sites_shifted,
blacklisted_sites_shifted_index = blacklisted_sites_shifted_index,
shift_back_chain = shift_back_chain,
gatk_override = gatk_override,
m2_extra_args = m2_extra_args,
m2_filter_extra_args = m2_filter_extra_args,
vaf_filter_threshold = vaf_filter_threshold,
f_score_beta = f_score_beta,
compress_output_vcf = compress_output_vcf,
max_read_length = max_read_length,
preemptible_tries = preemptible_tries
}
# This is a temporary task to handle "joint calling" until Mutect2 can produce a GVCF.
# This proivdes coverage at each base so low coverage sites can be considered ./. rather than 0/0.
call CoverageAtEveryBase {
input:
input_bam_regular_ref = AlignAndCall.mt_aligned_bam,
input_bam_regular_ref_index = AlignAndCall.mt_aligned_bai,
input_bam_shifted_ref = AlignAndCall.mt_aligned_shifted_bam,
input_bam_shifted_ref_index = AlignAndCall.mt_aligned_shifted_bai,
shift_back_chain = shift_back_chain,
control_region_shifted_reference_interval_list = control_region_shifted_reference_interval_list,
non_control_region_interval_list = non_control_region_interval_list,
ref_fasta = mt_fasta,
ref_fasta_index = mt_fasta_index,
ref_dict = mt_dict,
shifted_ref_fasta = mt_shifted_fasta,
shifted_ref_fasta_index = mt_shifted_fasta_index,
shifted_ref_dict = mt_shifted_dict
}
output {
File subset_bam = SubsetBamToChrM.output_bam
File subset_bai = SubsetBamToChrM.output_bai
File mt_aligned_bam = AlignAndCall.mt_aligned_bam
File mt_aligned_bai = AlignAndCall.mt_aligned_bai
File out_vcf = AlignAndCall.out_vcf
File out_vcf_index = AlignAndCall.out_vcf_index
File duplicate_metrics = AlignAndCall.duplicate_metrics
File coverage_metrics = AlignAndCall.coverage_metrics
File theoretical_sensitivity_metrics = AlignAndCall.theoretical_sensitivity_metrics
File contamination_metrics = AlignAndCall.contamination_metrics
File base_level_coverage_metrics = CoverageAtEveryBase.table
Int mean_coverage = AlignAndCall.mean_coverage
String major_haplogroup = AlignAndCall.major_haplogroup
Float contamination = AlignAndCall.contamination
}
}
task SubsetBamToChrM {
input {
File input_bam
File input_bai
String contig_name
String basename = basename(basename(input_bam, ".cram"), ".bam")
File? ref_fasta
File? ref_fasta_index
File? ref_dict
File? gatk_override
# runtime
Int? preemptible_tries
}
Float ref_size = if defined(ref_fasta) then size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") else 0
Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20
meta {
description: "Subsets a whole genome bam to just Mitochondria reads"
}
parameter_meta {
ref_fasta: "Reference is only required for cram input. If it is provided ref_fasta_index and ref_dict are also required."
input_bam: {
localization_optional: true
}
input_bai: {
localization_optional: true
}
}
command <<<
set -e
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
gatk PrintReads \
~{"-R " + ref_fasta} \
-L ~{contig_name} \
--read-filter MateOnSameContigOrNoMappedMateReadFilter \
--read-filter MateUnmappedAndUnmappedReadFilter \
-I ~{input_bam} \
-O ~{basename}.bam
>>>
runtime {
memory: "3 GB"
disks: "local-disk " + disk_size + " HDD"
docker: "us.gcr.io/broad-gatk/gatk:4.1.1.0"
preemptible: select_first([preemptible_tries, 5])
}
output {
File output_bam = "~{basename}.bam"
File output_bai = "~{basename}.bai"
}
}
task RevertSam {
input {
File input_bam
String basename = basename(input_bam, ".bam")
# runtime
Int? preemptible_tries
}
Int disk_size = ceil(size(input_bam, "GB") * 2.5) + 20
meta {
description: "Removes alignment information while retaining recalibrated base qualities and original alignment tags"
}
parameter_meta {
input_bam: "aligned bam"
}
command {
java -Xmx1000m -jar /usr/gitc/picard.jar \
RevertSam \
INPUT=~{input_bam} \
OUTPUT_BY_READGROUP=false \
OUTPUT=~{basename}.bam \
VALIDATION_STRINGENCY=LENIENT \
ATTRIBUTE_TO_CLEAR=FT \
ATTRIBUTE_TO_CLEAR=CO \
SORT_ORDER=queryname \
RESTORE_ORIGINAL_QUALITIES=false
}
runtime {
disks: "local-disk " + disk_size + " HDD"
memory: "2 GB"
docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.2-1552931386"
preemptible: select_first([preemptible_tries, 5])
}
output {
File unmapped_bam = "~{basename}.bam"
}
}
task CoverageAtEveryBase {
input {
File input_bam_regular_ref
File input_bam_regular_ref_index
File input_bam_shifted_ref
File input_bam_shifted_ref_index
File shift_back_chain
File control_region_shifted_reference_interval_list
File non_control_region_interval_list
File ref_fasta
File ref_fasta_index
File ref_dict
File shifted_ref_fasta
File shifted_ref_fasta_index
File shifted_ref_dict
Int? preemptible_tries
}
Int disk_size = ceil(size(input_bam_regular_ref, "GB") + size(input_bam_shifted_ref, "GB") + size(ref_fasta, "GB") * 2) + 20
meta {
description: "Remove this hack once there's a GVCF solution."
}
command <<<
set -e
java -jar /usr/gitc/picard.jar CollectHsMetrics \
I=~{input_bam_regular_ref} \
R=~{ref_fasta} \
PER_BASE_COVERAGE=non_control_region.tsv \
O=non_control_region.metrics \
TI=~{non_control_region_interval_list} \
BI=~{non_control_region_interval_list} \
COVMAX=20000 \
SAMPLE_SIZE=1
java -jar /usr/gitc/picard.jar CollectHsMetrics \
I=~{input_bam_shifted_ref} \
R=~{shifted_ref_fasta} \
PER_BASE_COVERAGE=control_region_shifted.tsv \
O=control_region_shifted.metrics \
TI=~{control_region_shifted_reference_interval_list} \
BI=~{control_region_shifted_reference_interval_list} \
COVMAX=20000 \
SAMPLE_SIZE=1
R --vanilla <<CODE
shift_back = function(x) {
if (x < 8570) {
return(x + 8000)
} else {
return (x - 8569)
}
}
control_region_shifted = read.table("control_region_shifted.tsv", header=T)
shifted_back = sapply(control_region_shifted[,"pos"], shift_back)
control_region_shifted[,"pos"] = shifted_back
beginning = subset(control_region_shifted, control_region_shifted[,'pos']<8000)
end = subset(control_region_shifted, control_region_shifted[,'pos']>8000)
non_control_region = read.table("non_control_region.tsv", header=T)
combined_table = rbind(beginning, non_control_region, end)
write.table(combined_table, "per_base_coverage.tsv", row.names=F, col.names=T, quote=F, sep="\t")
CODE
>>>
runtime {
disks: "local-disk " + disk_size + " HDD"
memory: "1200 MB"
docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.2-1552931386"
preemptible: select_first([preemptible_tries, 5])
}
output {
File table = "per_base_coverage.tsv"
}
}