-
Notifications
You must be signed in to change notification settings - Fork 1
/
ExmVC.0.StartVariantCalling.sh
executable file
·156 lines (133 loc) · 7.36 KB
/
ExmVC.0.StartVariantCalling.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/bin/bash
# This script takes a list of gVCF files generated by the HaplotypeCaller (filename must end ".list") and performs the multi-sample joint aggregation step.
# InpFil - (required) - List of gVCF files. List file name must end ".list"
# RefFil - (required) - shell file containing variables with locations of reference files, jar files, and resource directories; see list below for required variables
# TgtBed - (optional) - Exome capture kit targets bed file (must end .bed for GATK compatability) ; may be specified using a code corresponding to a variable in the RefFil giving the path to the target file- only required if calling pipeline
# VcfNam - (optional) - A name for the analysis - to be used for naming output files. Will be derived from input filename if not provided
# LogFil - (optional) - File for logging progress
# NumJobs - (optional) The number of jobs to split the variant calling over. Default is 20
# Flag - P - PipeLine - call the next step in the pipeline at the end of the job
# Flag - B - BadET - prevent GATK from phoning home
# Help - H - (flag) - get usage information
#list of required vairables in reference file:
# $REF - reference genome in fasta format - must have been indexed using 'bwa index ref.fa'
# $DBSNP - dbSNP vcf from GATK
# $EXOMPPLN - directory containing exome analysis pipeline scripts
# $GATK - GATK jar file
# $ETKEY - GATK key file for switching off the phone home feature, only needed if using the B flag
#list of required tools:
# java <http://www.oracle.com/technetwork/java/javase/overview/index.html>
# GATK <https://www.broadinstitute.org/gatk/> <https://www.broadinstitute.org/gatk/download>
## This file also requires exome.lib.sh - which contains various functions used throughout the Exome analysis scripts; this file should be in the same directory as this script
###############################################################
#set default arguments
usage="
ExmVC.0.StartVariantCalling.sh -i <InputFile> -r <reference_file> -t <targetfile> -n <OutputVCFName> -j <NumberOfParallelJobs> -l <logfile> -PXBH
-i (required) - List of gVCF files. List file name must end \".list\"
-r (required) - shell file containing variables with locations of reference files and resource directories
-t (required) - Exome capture kit targets or other genomic intervals bed file (must end .bed for GATK compatability)
-n (optional) - Analysis/output VCF name - will be derived from input filename if not provided; only used if calling pipeline
-j (optional) - Number of parallel jobs to split the analysis across; DEFAULT=20
-l (optional) - Log file
-P (flag) - Call next step of exome analysis pipeline after completion of script
-X (flag) - Do not run Variant Quality Score Recalibration - only if calling pipeline
-B (flag) - Prevent GATK from phoning home
-H (flag) - echo this message and exit
"
AllowMisencoded="false"
PipeLine="false"
NoRecal="false"
BadET="true"
NumJobs=20
while getopts i:r:t:n:l:j:PXBH opt; do
case "$opt" in
i) InpFil="$OPTARG";;
r) RefFil="$OPTARG";;
t) TgtBed="$OPTARG";;
n) VcfNam="$OPTARG";;
j) NumJobs="$OPTARG";;
l) LogFil="$OPTARG";;
P) PipeLine="true";;
X) NoRecal="true";;
B) BadET="true";;
H) echo "$usage"; exit;;
esac
done
#check all required paramaters present
if [[ ! -e "$InpFil" ]] || [[ ! -e "$RefFil" ]] || [[ -z "$TgtBed" ]]; then
echo "Missing/Incorrect required arguments"
echo "provided arguments: -i $InpFil -r $RefFil -t $TgtBed"
echo "usage: $usage"
exit
fi
#Call the RefFil to load variables
RefFil=`readlink -f $RefFil`
source $RefFil
#Load script library
source $EXOMPPLN/exome.lib.sh #library functions begin "func" #library functions begin "func"
#Local Variables
funcGetTargetFile #If the target file has been specified using a code, get the full path from the exported variable
if [[ -z "$VcfNam" ]];then VcfNam=`basename $InpFil`; VcfNam=${VcfNam/.list/}; fi # a name for the log file
LogFil=$VcfNam.VariantCalling.log
TmpLog=$VcfNam.VariantCalling.temp.log
BedDir=$VcfNam.TempTargetFiles #directory to contain the split target files
mkdir -p $BedDir
TgtNam=`basename $TgtBed` #The name of the original capture kit files
SpltFil=$BedDir/${TgtNam/.bed/} #base name for the split capture kit files
#Start Log File
ProcessName="Start joint calling of gVCFs with GATK GenotypeGVCFs" # Description of the script - used in log
funcWriteStartLog
#Split the Target intervals file into <<NumJobs>> seperate files
StepName="Split Capture Kit"
TarLen=`cat $TgtBed | wc -l` #get the number of lines in the Capture Kit file
SpltLen=$(( TarLen / NumJobs )) # get quotient of target file length and number of jobs --> number of lines per split file
## We will divide the Target file using split with SpltLen lines per a file. This will give us NumJobs files of SpltLen lines. If TarLen is not exactly divisible by NumJobs, there will be an additional file that contains the remainder of the lines; this will always be less than NumJobs lines. As is NumJobs is always going to be very small relative to SpltLen, this job is pointless. So we will append the final file to the preceding file.
StepCmd="split -l $SpltLen --numeric-suffixes=1 --additional-suffix=.bed $TgtBed $SpltFil"
funcRunStep
LastTgt=$SpltFil$NumJobs.bed #The Last target file to keep for Numjobs
FinlFil=$SpltFil$(( NumJobs + 1 )).bed #The small file at the end to append to LastTgt; may not exist
if [[ -e $FinlFil ]]; then
echo "Number of lines in jobs 1-"$(( NumJobs - 1 ))": "`cat $LastTgt | wc -l` >> $TmpLog
cat $LastTgt $FinlFil >> $LastTgt.temp
echo "Number of lines in job "$NumJobs": "`cat $LastTgt.temp | wc -l` >> $TmpLog
mv -v $LastTgt.temp $LastTgt >> $TmpLog
rm -v $FinlFil >> $TmpLog
fi
echo "-----------------------------------------------------------------------" >> $TmpLog
# Call the Joint variant calling step, one for each split target file and then wait until all are finished before calling the MergeVCF file.
StepName="Call Joint variant calling in "$NumJobs" jobs" >> $TmpLog
mkdir -p stdostde
callSplitVC (){
ArrNum=1
for SpltTgt in $BedDir/*; do
CallCmd="nohup $EXOMPPLN/ExmVC.1hc.GenotypeGVCFs.sh -i $InpFil -r $RefFil -t $SpltTgt -n $VcfNam -a $ArrNum -l $LogFil" ;
if [[ "$BadET" == "true" ]]; then CallCmd=$CallCmd" -B"; fi ;
CallCmd=$CallCmd" >stdostde/VariantCalling.$VcfNam.$ArrNum.o 2>stdostde/VariantCalling.$VcfNam.$ArrNum.e &" ;
echo $CallCmd >> $TmpLog ;
eval $CallCmd ;
ArrNum=$(( ArrNum + 1 )) ;
done
}
StepCmd=callSplitVC
funcRunStep
#Wait for all VC jobs to run
cat $TmpLog >> $LogFil
rm -f $TmpLog
wait
echo "Done all split Variant calling"`date` >> $TmpLog
VcfDir=$VcfNam.splitfiles
CMD="ls $VcfDir | grep vcf$ | wc -l"
echo $CMD >> $TmpLog
eval $CMD >> $TmpLog
echo "-----------------------------------------------------------------------" >> $TmpLog
rm -r $BedDir #remove the temporary target file directory
# Call Merge VCF
StepName="Call Merge VCFs" >> $TmpLog
StepCmd="nohup $EXOMPPLN/ExmVC.2.MergeVCF.sh -i $VcfDir -r $RefFil -l $LogFil -C"
if [[ "$PipeLine" == "true" ]]; then StepCmd=$StepCmd" -P"; fi
if [[ "$BadET" == "true" ]]; then StepCmd=$StepCmd" -B"; fi
if [[ "$NoRecal" == "true" ]]; then StepCmd=$StepCmd" -X"; fi
StepCmd=$StepCmd" >stdostde/MergeVCFs.$VcfNam.o 2>stdostde/MergeVCFs.$VcfNam.e &"
funcRunStep
#End Log
funcWriteEndLog