-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetagenome-asm
executable file
·287 lines (266 loc) · 6.63 KB
/
metagenome-asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#!/usr/bin/env bash
set -e
#Prints a help message
function print_help() {
echo "Usage: $0 [options] -i rawReads -g genomesize -o out -p sequencingPlatform -m pipeline"
echo ""
echo "Enhancing long read based strain-aware metagenome assembly"
echo ""
echo "Author: Xiao Luo"
echo "Date: Mar 2022"
echo ""
echo " Input:"
echo " rawReads: input long reads"
echo " genomesize: estimated genome size (e.g. 3g/30m/300k)"
echo " out: directory where to output the results"
echo " sequencingPlatform: long read sequencing platform: PacBio (-p pb) or Oxford Nanopore (-p ont)"
echo " pipeline: which pipeline to run: MetaBooster (-m MetaBooster) or HiFiBooster (-m HiFiBooster)"
echo ""
echo " Options for setting VeChat:"
echo " --split BOOL split target sequences into chunks (default: False)"
echo " --split-size INT split target sequences into chunks of desired size in lines, only valid when using --split (default: 1000000)"
echo " --scrub BOOL scrub chimeric reads (default: False)"
echo " --base BOOL perform base level alignment when computing read overlaps in the first cycle of VeChat (default: False)"
echo " --min-identity-cns FLOAT minimum sequence identity between read overlaps in the consensus round (default: 0.99)"
echo " --threads INT, -t INT: number of processes to run in parallel (default: 8)"
echo " --help, -h: print this help message"
exit 1
}
#Set options to default values
raw_reads=""
genomesize=""
threads=8
outdir="out"
platform="pb"
pipeline="MetaBooster"
split="False"
splitsize=1000000
scrub="False"
base="False"
min_identity_cns=0.99
#Print help if no argument specified
if [[ "$1" == "" ]]; then
print_help
fi
#Options handling
while [[ "$1" != "" ]]; do
case "$1" in
"--help" | "-h")
print_help
;;
"-i")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*)
raw_reads="$2"
shift 2
;;
esac
;;
"-g")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*)
genomesize="$2"
shift 2
;;
esac
;;
"-o")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*)
outdir="$2"
shift 2
;;
esac
;;
"-p")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*) if [[ "$2" == "pb" ]]; then
platform="pb"
shift 2
elif [[ "$2" == "ont" ]]; then
platform="ont"
shift 2
else
echo "Error: $1 must be either pb or ont"
exit 1
fi ;;
esac
;;
"-m")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*) if [[ "$2" == "MetaBooster" ]]; then
pipeline="MetaBooster"
shift 2
elif [[ "$2" == "HiFiBooster" ]]; then
pipeline="HiFiBooster"
shift 2
else
echo "Error: $1 must be either MetaBooster or HiFiBooster"
exit 1
fi ;;
esac
;;
"--split")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*)
split="$2"
shift 2
;;
esac
;;
"--split-size")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*)
splitsize="$2"
shift 2
;;
esac
;;
"--base")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*)
base="$2"
shift 2
;;
esac
;;
"--scrub")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*)
scrub="$2"
shift 2
;;
esac
;;
"--min-identity-cns")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*)
min_identity_cns="$2"
shift 2
;;
esac
;;
"--threads" | "-t")
case "$2" in
"")
echo "Error: $1 expects an argument"
exit 1
;;
*)
threads="$2"
shift 2
;;
esac
;;
#
--)
shift
break
;;
*)
echo "Error: invalid option \"$1\""
exit 1
;;
esac
done
#Exit if no input or no output files have been specified
if [[ $raw_reads == "" ]]; then
echo "Error: -i must be specified"
exit 1
fi
if [[ $genomesize == "" ]]; then
echo "Error: -g must be specified"
exit 1
fi
basepath=$(dirname $0)
if [ ! -d $outdir ]; then
mkdir $outdir
else
echo Directory \'$outdir\' 'already exists, please use a new one, exiting...'
exit
fi
# Run VeChat for error correction
vechat_cmd="vechat "$raw_reads" -t "$threads" -u --platform "$platform" -o "$outdir"/reads.vechat_corrected.fa --min-identity-cns "$min_identity_cns
if [[ $split == "True" ]];then
vechat_cmd=$vechat_cmd" --split --split-size "$split-size
fi
if [[ $scrub == "True" ]];then
vechat_cmd=$vechat_cmd" --scrub "
fi
if [[ $base == "True" ]];then
vechat_cmd=$vechat_cmd" --base "
fi
eval $vechat_cmd
if [[ $platform == "pb" ]];then
# Run Canu for error correction
rm -rf $outdir/canu
canu -p out -d $outdir/canu useGrid=false rawErrorRate=0.2 genomeSize=$genomesize -pacbio $raw_reads
zcat $outdir/canu/out.trimmedReads.fasta.gz >$outdir/xx
cat $outdir/reads.vechat_corrected.fa $outdir/xx >$outdir/reads.merged.fa
rm $outdir/xx
# Run (Hi)Canu's assembly module
combined_read=$outdir/reads.merged.fa
rm -rf $outdir/combined
if [[ $pipeline == "MetaBooster" ]];then
canu -p out -d $outdir/combined useGrid=false genomeSize=$genomesize -corrected -pacbio $combined_read
else #HiFiBooster
canu -p out -d $outdir/combined useGrid=false genomeSize=$genomesize -corrected -pacbio-hifi $combined_read
fi
else #ONT
# Run Canu for error correction
rm -rf $outdir/canu
canu -p out -d $outdir/canu useGrid=false rawErrorRate=0.2 genomeSize=$genomesize -nanopore $raw_reads
zcat $outdir/canu/out.trimmedReads.fasta.gz >$outdir/xx
cat $outdir/reads.vechat_corrected.fa $outdir/xx >$outdir/reads.merged.fa
rm $outdir/xx
# Run (Hi)Canu's assembly module
combined_read=$outdir/reads.merged.fa
rm -rf $outdir/combined
if [[ $pipeline == "MetaBooster" ]];then
canu -p out -d $outdir/combined useGrid=false genomeSize=$genomesize -corrected -nanopore $combined_read
else #HiFiBooster
canu -p out -d $outdir/combined useGrid=false genomeSize=$genomesize -corrected -pacbio-hifi $combined_read
fi
fi
cp $outdir/combined/out.contigs.fasta $outdir/assembly.fa
echo 'Program finished. Please see this file for output contigs: '$outdir"/assembly.fa"