-
Notifications
You must be signed in to change notification settings - Fork 1
/
ExmAdHoc.8.Alignment_Finisher.sh
executable file
·138 lines (119 loc) · 6.22 KB
/
ExmAdHoc.8.Alignment_Finisher.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/bin/bash
#$ -cwd -l mem=16G,time=6:: -N FinishBWA
# This script can be used to run the final steps of the bam to bam mapping scripts (ExmAln.1b) when they have completed the bwa-mem mapping but run out of time in the latter steps. It should be run within the alignment working directory, with the same arguments as the original mapping script and will select the appropriate steps to run based on the existing files in the directory. It will automatically trigger the pipeline if requestd.
# Usage notes:
# Please see ExmAln.1b.ReAlign_Bam_with_BWAmem.sh for full details
# The script should be run from within the mapping directory
# InpFil - (required) - Path to the aligned Bam file to be aligned
# RefFil - (required) - shell file containing variables with locations of reference files and resource directories; see list below for required variables
# LogFil - (optional) - File for logging progress
# TgtBed - (optional) - Exome capture kit targets bed file (must end .bed for GATK compatability) ; may be specified using a code corresponding to a variable in the RefFil giving the path to the target file- only required if calling pipeline
# PipeLine - P -(flag) - will start the GATK realign and recalibrate pipeline using the files generated by this script
# Flag - F - Fix mis-encoded base quality scores - see GATK manual. GATK will subtract 31 from all quality scores; used to fix encoding in some datasets (especially older Illumina ones) which starts at Q64 (see https://en.wikipedia.org/wiki/FASTQ_format#Encoding); This flag is only possibly necessary if calling the pipeline.
# Help - H - (flag) - get usage information
#list of variables required in reference file:
# $REF - reference genome in fasta format - must have been indexed using 'bwa index ref.fa'
# $EXOMPPLN - directory containing exome analysis pipeline scripts,
#list of required tools:
# samtools <http://samtools.sourceforge.net/> <http://sourceforge.net/projects/samtools/files/>
# bwa mem <http://bio-bwa.sourceforge.net/> <http://sourceforge.net/projects/bio-bwa/files/>
# java <http://www.oracle.com/technetwork/java/javase/overview/index.html>
# picard <http://picard.sourceforge.net/> <http://sourceforge.net/projects/picard/files/>
## This file also requires exome.lib.sh - which contains various functions used throughout the Exome analysis scripts; this file should be in the same directory as this script
## Note that htscmd bam2fq will generate a warning:
## [W::bam_hdr_read] EOF marker is absent. The input is probably truncated.
## This is not a problem, it is just a bug related to piping the stdin as the input, it can be ignored
###############################################################
#set default arguments
usage="
ExmAdHoc.9.Alignment_Finisher.sh -i <InputFile> -r <reference_file> -t <target intervals file> -l <logfile> -PH
-i (optional) - Aligned bam file
-r (optional) - shell file containing variables with locations of reference files and resource directories
-l (optional) - Log file
-t (optional) - Exome capture kit targets or other genomic intervals bed file (must end .bed for GATK compatability)
-P (flag) - Initiate exome analysis pipeline after completion of script
-F (flag) - Fix mis-encoded base quality scores - see GATK manual
-H (flag) - echo this message and exit
"
PipeLine="false"
FixMisencoded="false"
#get arguments
while getopts i:r:l:t:PFH opt; do
case "$opt" in
i) InpFil="$OPTARG";;
r) RefFil="$OPTARG";;
l) LogFil="$OPTARG";;
t) TgtBed="$OPTARG";;
P) PipeLine="true";;
F) FixMisencoded="true";;
H) echo "$usage"; exit;;
esac
done
#check all required paramaters present
if [[ -z "$InpFil" ]]; then InpFil=$(ls | grep -m 1 bam$); fi
if [[ -z "$RefFil" ]]; then RefFil=$(grep -m 1 "Pipeline Reference File" *log | cut -d" " -f 5); fi
if [[ -z "$TgtBed" ]]; then TgtBed=$(grep -m 1 "Target Intervals File" *log | cut -d" " -f 5); fi
if [[ ! -e "$InpFil" ]] || [[ ! -e "$RefFil" ]]; then
echo "Missing/Incorrect required arguments"
echo "$usage"
echo "Provided arguments:"
echo " InpFil: "$InpFil" RefFil: "$RefFil" TgtBed: "$TgtBed
exit
fi
#Call the RefFil to load variables
RefFil=`readlink -f $RefFil`
source $RefFil
#Load script library
source $EXOMPPLN/exome.lib.sh #library functions begin "func"
#set local variables
BamNam=`echo $InpFil | sed s/.bwamem.*//`
echo " InpFil: "$BamNam" RefFil: "$RefFil" TgtBed: "$TgtBed
if [[ -z "$LogFil" ]]; then LogFil=$BamNam.BbB.log; fi # a name for the log file
AlnFil=$BamNam.bwamem.bam #filename for bwa-mem aligned file
SrtFil=$BamNam.bwamem.sorted.bam #output file for sorted bam
DdpFil=$BamNam.bwamem.mkdup.bam #output file with PCR duplicates marked
FlgStat=$BamNam.bwamem.flagstat #output file for bam flag stats
IdxStat=$BamNam.idxstats #output file for bam index stats
TmpLog=$BamNam.BwB.temp.log #temporary log file
TmpDir=$BamNam.BwB.tempdir; mkdir -p $TmpDir #temporary directory
#Sort the bam file by coordinate
if [[ -e $AlnFil ]]; then
echo "Start Sort"
StepName="Sort Bam using PICARD"
StepCmd="java -Xmx4G -Djava.io.tmpdir=$TmpDir -jar $PICARD/SortSam.jar
INPUT=$AlnFil
OUTPUT=$SrtFil
SORT_ORDER=coordinate
CREATE_INDEX=TRUE"
echo $StepCmd
funcRunStep
rm $AlnFil #removed the "Aligned bam"
fi
#Mark the duplicates
if [[ -e $SrtFil ]]; then
StepName="Mark PCR Duplicates using PICARD"
StepCmd="java -Xmx4G -Djava.io.tmpdir=$TmpDir -jar $PICARD/MarkDuplicates.jar
INPUT=$SrtFil
OUTPUT=$DdpFil
METRICS_FILE=$DdpFil.dup.metrics.txt
CREATE_INDEX=TRUE"
funcRunStep
rm $SrtFil ${SrtFil/bam/bai} #removed the "Sorted bam"
fi
#Get flagstat
StepName="Output flag stats using Samtools"
StepCmd="samtools flagstat $DdpFil > $FlgStat"
funcRunStep
#get index stats
StepName="Output idx stats using Samtools"
StepCmd="samtools idxstats $DdpFil > $IdxStat"
funcRunStep
#Call next steps of pipeline if requested
NextJob="Run Genotype VCF"
QsubCmd="qsub -o stdostde/ -e stdostde/ $EXOMPPLN/ExmAln.2.HaplotypeCaller_GVCFmode.sh -i $DdpFil -r $RefFil -t $TgtBed -l $LogFil -P -B"
funcPipeLine
NextJob="Get basic bam metrics"
QsubCmd="qsub -o stdostde/ -e stdostde/ $EXOMPPLN/ExmAln.3a.Bam_metrics.sh -i $DdpFil -r $RefFil -l $LogFil"
funcPipeLine
#End Log
funcWriteEndLog