-
Notifications
You must be signed in to change notification settings - Fork 0
/
SRFChainer
executable file
·253 lines (227 loc) · 8.36 KB
/
SRFChainer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
#!/bin/bash
set -e
set -o pipefail
thisfolder=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) # https://stackoverflow.com/questions/59895/how-do-i-get-the-directory-where-a-bash-script-is-located-from-within-the-script
# executables' absolute paths/commands (make sure they work!)
graphaligner=$thisfolder/tools/GraphAligner/bin/GraphAligner
efglocate=$thisfolder/tools/efg-locate/efg-locate
chainxblockgraph=$thisfolder/tools/ChainX-block-graph/chainx-block-graph
# default params
workingfolder="."
threads=8 # threads
edgemincount=0 # semi-repeat-free seeds only
alternativechains=0
chainingguess="--initial-guess-coverage=0.5 --ramp-up-factor=1.5"
extendoptions="--max-cluster-extend 5 -b 10" # GraphAligner default extend options
discardoption="(substr(\$16,6) > 0.90) && ((\$4-\$3)*100/\$2 >= 50)" # using -c option, discard alignments with identity <= 90% or read coverage < 50%
# load balancing
locatework=(32 32 22 14 14 11 10 9 8 7 6)
chainwork=(8 25 32 32 32 32 32 32 32 32 32)
print_help()
{
echo "Pipeline to align long reads to indexable Elastic Founder Graphs based on chained semi-repeat-free seeds"
echo "usage: SRFChainer -g graph.gfa -f reads.fastq -a alignments.gaf"
echo " -h --help: show this screen"
echo " -g graph.gfa: semi-repeat-free EFG in xGFA format"
echo " -f reads.fastq: reads in FASTQ format"
echo " -a alignmentsout.gaf: output alignments in GAF format"
echo " -t threads: # of threads"
echo " -i IGNORECHARS : ignore the following characters for indexability/seed finding"
echo " -c : discard bad alignments (identity <= 0.9 or read coverage < 50%) and run GraphAligner on unaligned reads"
echo " -p : disable pipeline mode and save the seeds in working folder"
echo " -e : make GraphAligner consider each single seed (cluster) for extension"
echo " -m edgemincount: heuristic parameter for seed computation (see efg-locate)"
echo " -n altchains : heuristic parameter to chain an additional n times for each strand"
echo " -w path: working folder for output and temporary files"
}
load_balance()
{
m=$2
if (( $m >= ${#locatework[@]} ))
then
m=$(( ${#locatework[@]} - 1 ))
fi
threads1=$(( $1 * ${chainwork[$m]} / (${chainwork[$m]} + ${locatework[$m]})))
threads1=$(( threads1 >= $1 ? $1 - 1 : threads1 ))
threads1=$(( threads1 < 1 ? 1 : threads1 ))
echo "$(($1 - $threads1))"
}
# https://stackoverflow.com/questions/12022592/how-can-i-use-long-options-with-the-bash-getopts-builtin
for arg in "$@"; do
shift
case "$arg" in
'--help') set -- "$@" '-h' ;;
*) set -- "$@" "$arg" ;;
esac
done
while getopts "hepcg:f:a:t:w:m:i:n:" option; do
case $option in
h) # display help
print_help
exit;;
g) # graph
argg=true
graph="$OPTARG" ;;
f) # fastq reads
argf=true
reads="$OPTARG" ;;
a) # output
arga=true
alignmentsout="$OPTARG" ;;
t) # threads
argt=true
threads="$OPTARG" ;;
i) # ignorechars
argi=true
ignorechars="$OPTARG" ;;
w) # working folder
argw=true
workingfolder="$OPTARG" ;;
c) # discard short alignment heuristic flag
argc=true ;;
p) # disable pipeline mode flag
argp=true ;;
e) # extend all clusters
arge=true ;;
m) # edgemincount parameter
argm=true
edgemincount="$OPTARG" ;;
n) # alternative chains parameter
argn=true
alternativechains="$OPTARG" ;;
\?) # invalid option
echo "Error: Invalid option"
print_help
exit;;
esac
done
if [[ "$argg" != true ]] || [[ "$argf" != true ]] ; then
print_help
exit
fi
if [[ "$argp" != true ]] && [[ "$threads" -lt 2 ]] ; then
echo "Please pick at least 2 threads in pipeline mode!"
exit 1
fi
ignorecharsarg=""
if [[ "$argi" = true ]] ; then
ignorecharsarg="--ignore-chars=$ignorechars"
fi
if [[ "$arge" = true ]] ; then
extendoptions="--max-cluster-extend -1 --multimap-score-fraction 0.00 -b 10"
fi
# move to working folder
if [[ "$argw" = true ]] ; then
workingfolder="${workingfolder%/}"
else
workingfolder="."
fi
if [[ "$argp" = true ]] ; then
# find semi-repeat-free seeds
$efglocate --approximate --split-output-matches --reverse-complement --rename-reverse-complement --overwrite \
$ignorecharsarg \
--threads $threads \
--approximate-edge-match-min-count $edgemincount \
$graph \
<(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \
"$workingfolder/$(basename $reads)_srfchain_seeds.gaf"
# chainx-block-graph chain
$chainxblockgraph --semi-global --split-output-matches-graphaligner --overwrite $chainingguess \
--alternative-chains $alternativechains \
--threads $threads \
$graph \
"$workingfolder/$(basename $reads)_srfchain_seeds.gaf" \
"$workingfolder/$(basename $reads)_srfchain_chain.gaf"
# GraphAligner extend
$graphaligner $extendoptions \
-t $threads \
-g $graph \
-f $reads \
--realign "$workingfolder/$(basename $reads)_srfchain_chain.gaf" \
-a $alignmentsout
if [[ "$argc" = true ]] ; then
echo -n "Filtering alignments with identity <= 90% or read coverage < 50%..."
awk "{if ($discardoption) {print}}" "$alignmentsout" > "$workingfolder/filtered_alignments_$$.gaf"
mv "$workingfolder/filtered_alignments_$$.gaf" "$alignmentsout"
echo " done."
unalignedreads=$({ grep -v \
-f <(cut -f1 $alignmentsout | uniq) \
<(awk 'NR % 4 == 1' $reads | cut -d' ' -f1 | tr -d "@") || true; })
unalignedreadscount=$(echo -n "$unalignedreads" | wc -l)
if [[ "$unalignedreadscount" -gt "0" ]] ; then
grep -A 1 --no-group-separator \
-f <(echo "$unalignedreads") \
<(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \
> "$workingfolder/$(basename $reads)_unaligned_reads.fasta"
$graphaligner -x "vg" \
-t $threads \
-g $graph \
-f "$workingfolder/$(basename $reads)_unaligned_reads.fasta" \
-a "$workingfolder/$(basename $reads)_unaligned_reads.gaf"
cat "$workingfolder/$(basename $reads)_unaligned_reads.gaf" >> $alignmentsout
else
echo "There are no unaligned reads to realign"
fi
fi
else
# pipeline of above commands
if [[ "$argc" = true ]] ; then
# set up final GraphAligner call
fastapipe=$(mktemp -u --suffix ".fasta")
mkfifo -m 600 $fastapipe
trap '{ rm -f -- "$fastapipe"; }' EXIT
# TODO find a way to not store on disk the unaligned alignments and directly append them
$graphaligner -x "vg" \
-t $threads \
-g $graph \
-f $fastapipe \
-a "$workingfolder/unaligned_reads_$$.gaf" &
# give GraphAligner a dummy "file", see https://github.com/maickrau/GraphAligner/issues/105
set +eo pipefail ; { echo > $fastapipe & } ; set -eo pipefail
fi
efglocatethreads=$(load_balance $threads $edgemincount)
chainxthreads=$(( $threads - $efglocatethreads ))
echo "load balance: $efglocatethreads for locate, $chainxthreads for chaining"
$efglocate --approximate --split-output-matches --reverse-complement --rename-reverse-complement --overwrite \
--threads $efglocatethreads \
--approximate-edge-match-min-count $edgemincount \
$graph \
<(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \
/dev/stdout | \
$chainxblockgraph --semi-global --split-output-matches-graphaligner --overwrite $chainingguess \
--threads $chainxthreads \
--alternative-chains $alternativechains \
$graph \
/dev/stdin \
/dev/stdout | \
$graphaligner $extendoptions \
-t $threads \
-g $graph \
-f $reads \
--realign /dev/stdin \
-a $alignmentsout
if [[ "$argc" = true ]] ; then
echo -n "Filtering alignments with identity <= 90% or read coverage < 50%..."
awk "{if ($discardoption) {print}}" "$alignmentsout" > "$workingfolder/filtered_alignments_$$.gaf"
mv "$workingfolder/filtered_alignments_$$.gaf" "$alignmentsout"
echo " done."
unalignedreads=$({ grep -v \
-f <(cut -f1 $alignmentsout | uniq) \
<(awk 'NR % 4 == 1' $reads | cut -d' ' -f1 | tr -d "@") || true; })
unalignedreadscount=$(echo -n "$unalignedreads" | wc -l)
if [[ "$unalignedreadscount" -gt "0" ]] ; then
echo "There are the following unaligned reads:"
echo "$unalignedreads"
grep -A 1 --no-group-separator \
-f <(echo "$unalignedreads") \
<(awk 'NR % 4 == 1 || NR % 4 == 2' $reads | sed 's/^@/>/g' | cut -d' ' -f1) \
> $fastapipe
wait $(jobs -p)
else
echo "There are no unaligned reads to realign"
set +eo pipefail ; { kill $(jobs -p) & } ; set -eo pipefail
fi
cat "$workingfolder/unaligned_reads_$$.gaf" >> $alignmentsout
rm "$workingfolder/unaligned_reads_$$.gaf"
fi
fi