forked from ekg/gatk-swe
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeline.sh
executable file
·347 lines (269 loc) · 10.4 KB
/
pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
#!/bin/bash
set -e -x
set -o pipefail
. ./general_settings.sh
. $1
#INPUT_FASTQ=paired[s3://gapp-west/test@clusterk.com/sample_exome/025_Bioplanet_GCAT_30x/gcat_set_025_1.fastq.gz,s3://gapp-west/test@clusterk.com/sample_exome/025_Bioplanet_GCAT_30x/gcat_set_025_2.fastq.gz]
export GATK_REFERENCE=s3://gapp-east/gatk-reference/
export ANALYSIS=$K_ANALYSIS
export PATH=$PATH:./bin
export SHELL=/bin/bash
[ "$SWE_ENGINE" != "" ] || export SWE_ENGINE=clusterk
[ "$CHROMOSOMES" != "" ] || export CHROMOSOMES="chr22 chr21 chr20 chr19 chr18 chr17 chr16 chr15 chr14 chr13 chr12 chr11 chr10 chr9 chr8 chr7 chr6 chr5 chr4 chr3 chr2 chr1 chrY chrX"
#export SWE_DEV_MODE=1 # cache successfull task IDs
GENOME_FAI=./bin/ucsc.hg19.fasta.fai
[ -e $GENOME_FAI ]
export SWE_S3_STORAGE=s3://gapp-east-temp
export common_params=" -p 10 -u ./swe -u pre_script.sh -u bin.tar.gz -t ANALYSIS=$K_ANALYSIS -c auto:ANALYSIS,STAGE -e auto:ANALYSIS,STAGE -m 15000 -dm 40000 -de auto:ANALYSIS,STAGE -th auto:ANALYSIS,STAGE "
# create a new job
job_id=$(kjob add -n $NAME-`date "+%Y-%m-%d-%H:%M:%S"`)
common_params="$common_params -j $job_id"
#common_params="$common_params -j 1"
#if a GATK 3.0 jar is specified - use it. Otherwise fall back to GATK_Lita
#upload GATK_JAR
if [ "$GATK_FULL_JAR" != "" ]
then
GATK_FULL_JAR=$(./swe store ./GenomeAnalysisTK.jar)
fi
if [ "$GATK_LITE_JAR" != "" ]
then
# if HTTP link for 3.0 is not provided fall back on free GATK_Lite
GATK_LITE_JAR=$(./swe store ./GenomeAnalysisTKLite.jar)
fi
[ "$GATK_LITE_JAR" != "" ] #GATK_JAR must be defined
[ -e bin.tar.gz ] || tar czvf bin.tar.gz ./bin
NAME_PREFIX="$NAME:$K_ANALYSIS";
#####################################################################################################
# Process input files:
# 1. Split each input file into chunks of 1GB
# 2. Launch alignment job per each chunk
# 3. Each alignment will return one sorted file per chromosome, which will later be combined.
#
# supported input formats: paired[s3_path_for_read1,s3_path_for_read2]
# local[local_path_for_read1,local_path_for_read2]
#
#
#approximate size of input splits in bytes. 500MB
input_split_size=500000000
# also, you can just use a number of line
#input_split_lines=8000000
#list of alignment job IDs
align_job_ids=""
for input in $INPUT_FASTQ
do
echo $input
#check input string format
#compute number of alignment splits. one split per GB of input data.
if [[ $input =~ paired\[(.*),(.*)\] ]] ;
then
file1=${BASH_REMATCH[1]}
file2=${BASH_REMATCH[2]}
file1_size=$(es3 ls $file1 | head -n 1| cut -f 2)
elif [[ $input =~ local\[(.*),(.*)\] ]] ;
then
file1=$(./swe store ${BASH_REMATCH[1]})
file2=$(./swe store ${BASH_REMATCH[2]})
file1_size=$(stat -c%s ${BASH_REMATCH[1]})
else
echo Not implemented
false
fi
splits=$[$file1_size/$input_split_size+1]
echo Processing $input, will split it in $splits chunks
# input.sh: accepts input fastq, splits in N chunks, saves them as N.fastq.gz
split_job_id=$(ksub $common_params \
-u split_fastq/split.sh \
-u split_fastq/split_input_fastq.pl \
-t NAME=$NAME_PREFIX:split -t STAGE=split \
-v splits=$splits \
-v input1=$file1 \
-v input2=$file2 \
-c 16 \
--wrap="bash split.sh")
# count the splits we've made
#./swe wait $split_job_id
#cat $(./swe fetch $split_job_id:splits.txt)
#align each split, reference them via $split_job_id
for split in $(seq 1 $splits)
do
# align.sh: accepts input split file, produces alignment, split by chromosome
align_job_id=$(ksub $common_params \
-u align/align.sh \
-d $split_job_id \
-t NAME=$NAME_PREFIX:align -t STAGE=align\
-v sample_id=SAMPLE \
-v input=$split_job_id:$split.fastq.gz \
--wrap="bash align.sh")
align_job_ids="$align_job_ids $align_job_id"
done
done
########### GATK stage
# 1. combine chromosome files from each alignment job into single BAM file per chromosome
# 2. Use one of the chromosomes for base quality recalibration (chr22)
# 3. For each chromosome, run split_chr to find safe spot where chromosome can be broken
# 4. For each resulting interval run GATK HaplotypeCaller and the rest of Best Practices GATK pipeline
#
#
# number of splits per chromosomes
region_splits=10
#combine per chromosomes
for chr in $CHROMOSOMES
do
#get chromosome size and compute optimal number of splits
chr_size=$(grep "^$chr " $GENOME_FAI |cut -f 2)
region_splits=$[$chr_size/5000000]
[ "$region_splits" != "0" ]
#create comma separated list of alignment jobs
align_job_list=$(echo $align_job_ids |tr " " ",")
#create list of input alignment files for current chromosome
input_array=""
for align_job in $align_job_ids
do
input_array="$input_array $align_job:$chr.bam"
done
#submit comine jobs, and pass list of alignent files as input, and all alignment jobs as prerequsite
#combine.sh: accepts a list of aligned bam files, produced combined file for a given chromsome
# output: $combine_job_id:$chr.bam
combine_job_id=$(ksub $common_params \
-d $align_job_list \
-v chr=$chr \
-v input="$input_array" \
-u combine/combine.sh \
-u MarkDuplicates.jar \
-c 8 \
-t NAME=$NAME_PREFIX:combine:$chr -t STAGE=combine \
--wrap="bash combine.sh" )
#for one of the chromosomes submit BQSR job to produce Base Quality Recalibration files
#if [ "$chr" == "$BQSR_CHR" ]
#if [ false ]q
#then
#bqsr.sh: runs Base Quality recalibration on chr22
# output is $bqsr_job_id:bqsr.grp
# bqsr_job_id=$(ksub $common_params \
# -d $combine_job_id \
# -v input=$combine_job_id:$chr.bam \
# -t NAME=$NAME_PREFIX:bqsr:$chr -t STAGE=bqsr \
# -u bqsr/bqsr.sh \
# -v chr=$chr \
# -v gatk_jar=$GATK_JAR \
# --wrap="bash bqsr.sh")
#
# fi
# for each combined chromosomes, run split_chr.sh to obtain list of
#chr_split.sh: finds genomic locations where it is safe to split a chromosomes
# returns list of bam files: split.$chr.$split_id.bam
chr_split_id=$(ksub $common_params \
-d $combine_job_id \
-v input=$combine_job_id:$chr.bam \
-v splits=$region_splits \
-v chr=$chr \
-t NAME=$NAME_PREFIX:chr_split:$chr -t STAGE=chr_split \
-u split_chr/split_chr.sh \
-c 8 \
-u split_chr/advanced_splitter.pl \
-u split_chr/breakpoints2intervals.pl \
-u split_chr/equally_spaced_intervals.pl \
--wrap="bash split_chr.sh")
#for each chromosome split, start an analysis job from each caller
for split_id in $(seq 1 $region_splits)
do
#gatk.sh runs gatk on a sub-v nterval and applies BQSR
# output: $gatk_job_id:raw.vcf
# -d $chr_split_id,$bqsr_job_id \
gatk_ug_job_id=$(ksub $common_params \
-d $chr_split_id \
-v input=$chr_split_id:$split_id.bam \
-v interval=$chr_split_id:$split_id.interval \
-t NAME=$NAME_PREFIX:gatk-ug:$chr:$split_id -t STAGE=gatk-ug \
-u gatk-ug/gatk-ug.sh \
-v gatk_jar=$GATK_LITE_JAR \
--wrap="bash gatk-ug.sh")
gatk_hc_job_id=$(ksub $common_params \
-d $chr_split_id \
-v input=$chr_split_id:$split_id.bam \
-v interval=$chr_split_id:$split_id.interval \
-t NAME=$NAME_PREFIX:gatk-hc:$chr:$split_id -t STAGE=gatk-hc \
-u gatk-hc/gatk-hc.sh \
-v gatk_jar=$GATK_FULL_JAR \
--wrap="bash gatk-hc.sh")
freebayes_job_id=$(ksub $common_params \
-d $chr_split_id \
-v input=$chr_split_id:$split_id.bam \
-v interval=$chr_split_id:$split_id.interval \
-t NAME=$NAME_PREFIX:freebayes:$chr:$split_id -t STAGE=freebayes \
-u freebayes/freebayes.sh \
--wrap="bash freebayes.sh")
platypus_job_id=$(ksub $common_params \
-d $chr_split_id \
-v input=$chr_split_id:$split_id.bam \
-v interval=$chr_split_id:$split_id.interval \
-t NAME=$NAME_PREFIX:platypus:$chr:$split_id -t STAGE=platypus \
-u platypus/platypus.sh \
--wrap="bash platypus.sh")
#[ "$swe_wait" == "" ] || ./swe wait $caller_job_id
caller_job_ids="$caller_job_ids $gatk_ug_job_id $gatk_hc_job_id $freebayes_job_id $platypus_job_id"
done
done
# collect all caller job ids, and create comma separated list for dependendencies (-d)
caller_job_list=$(echo $caller_job_ids |tr " " ",")
input_array=""
for caller_job in $caller_job_ids
do
input_array="$input_array $caller_job:raw.vcf"
done
#submit a job that combines all sub-region vcf files, into one sorted VCF
#combine_vcf.sh concatenate sub-region VCFs into final VCF
combine_vcf_job_id=$(ksub $common_params \
-d $caller_job_list \
-v input="$input_array" \
-t NAME=$NAME_PREFIX:combine_vcf -t STAGE=combine_vcf \
-u combine_vcf/combine_vcf.sh \
--wrap="bash combine_vcf.sh" )
for chr in $CHROMOSOMES
do
chr_size=$(grep "^$chr " $GENOME_FAI |cut -f 2)
region_splits=$[$chr_size/5000000]
[ "$region_splits" != "0" ]
#for each chromosome split, start an analysis job from each caller
for split_id in $(seq 1 $region_splits)
do
#gatk.sh runs gatk on a sub-v nterval and applies BQSR
# output: $gatk_job_id:raw.vcf
# -d $chr_split_id,$bqsr_job_id \
glia_job_id=$(ksub $common_params \
-d $combine_vcf_job_id \
-v input=$chr_split_id:$split_id.bam \
-v interval=$chr_split_id:$split_id.interval \
-v candidates=$combine_vcf_job_id:raw.vcf.gz \
-t NAME=$NAME_PREFIX:glia:$chr:$split_id -t STAGE=glia \
-u glia/glia.sh \
--wrap="bash glia.sh")
gl_job_ids="$gl_job_ids $glia_job_id"
done
done
gl_job_list=$(echo $gl_job_ids |tr " " ",")
input_array=""
for gl_job in $gl_job_ids
do
input_array="$input_array $gl_job:raw.vcf"
done
combine_final_vcf_job_id=$(ksub $common_params \
-d $gl_job_list \
-v input="$input_array" \
-t NAME=$NAME_PREFIX:combine_vcf -t STAGE=combine_vcf \
-u combine_vcf/combine_vcf.sh \
--wrap="bash combine_vcf.sh" )
#submit a job that run
#run variant quality recalibration
# output: $vqsr_job_id:recalibrated.filtered.vcf.gz
#vqsr_job_id=$(ksub $common_params \
# -d $combine_vcf_job_id \
# -u vqsr/vqsr.sh \
# -v gatk_jar=$GATK_JAR \
# -t NAME=$NAME_PREFIX:vqsr -t STAGE=vqsr \
# -v input=$combine_vcf_job_id:raw.vcf.gz \
# --wrap=" bash vqsr.sh")
#wait for all jobs to finish
kwait $combine_final_vcf_job_id
./swe fetch $combine_final_vcf_job_id:raw.vcf.gz
exit 0