-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpi-tajima.sh
357 lines (353 loc) · 20.2 KB
/
pi-tajima.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
#!/bin/bash
set -e
{
usage="$(basename "$0") [-h] [-f <pi_tajima>] [-v <merged.vcf>]
This program will invoke R, several R libraries and vcfstats to process merged pi and Tajima's D values per bin.
Arguments:
-h show this help text
-f binned pi and Tajima's D values, in tabular format. As example: Europe.50.pi.D
-v merged VCF file with viral frequency values per variant. As example: merged.GISAID.AF.vcf"
options=':hf:v:'
while getopts $options option; do
case "$option" in
h) echo "$usage"; exit;;
f) f=$OPTARG;;
v) v=$OPTARG;;
:) printf "missing argument for -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;;
\?) printf "illegal option: -%s\n" "$OPTARG" >&2; echo "$usage" >&2; exit 1;;
esac
done
# mandatory arguments
if [ ! "$f" ] || [ ! "$v" ]; then
echo "arguments -f and -v must be provided"
echo "$usage" >&2; exit 1
fi
# Obtaining 95% confidence intervals in Tajima's D and parsing provided VCF file
echo "====> Obtaining 95% confidence intervals in Tajima's D and parsing provided VCF file"
echo ""
dir1=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
echo "====> The working directory is the following: ${dir1}/"
echo ""
wget https://raw.githubusercontent.com/cfarkas/SARS-CoV-2-freebayes/master/confidence_interval.R
cp ${f} input.pi.D
grep -v "nan" input.pi.D > input.pi.D.clean && rm input.pi.D
mv input.pi.D.clean input.pi.D
Rscript confidence_interval.R input.pi.D && rm input.pi.D
sed -i 's/V1/bins/'g bins_2.5%_confidence.tab
sed -i 's/V1/bins/'g bins_97.5%_confidence.tab
awk '{print "NC_045512.2""\t"($1-50)"\t"$1}' bins_2.5%_confidence.tab > bins_2.5%_confidence.bed
awk '{print "NC_045512.2""\t"($1-50)"\t"$1}' bins_97.5%_confidence.tab > bins_97.5%_confidence.bed
tail -n +2 bins_2.5%_confidence.bed > bins_2.5%_conf_interval.bed && rm bins_2.5%_confidence.tab bins_2.5%_confidence.bed
tail -n +2 bins_97.5%_confidence.bed > bins_97.5%_conf_interval.bed && rm bins_97.5%_confidence.tab bins_97.5%_confidence.bed
mv bins_2.5%_conf_interval.bed bins_2.5%_confidence.bed
mv bins_97.5%_conf_interval.bed bins_97.5%_confidence.bed
vcftools --vcf ${v} --bed bins_2.5%_confidence.bed --out 2.5_CI_confidence --recode --keep-INFO-all
vcftools --vcf ${v} --bed bins_97.5%_confidence.bed --out 97.5_CI_confidence --recode --keep-INFO-all
echo ""
# making temporal directory and parsing vcf file by protein or feature
echo "====> Making temporal directory and parsing vcf file by protein or feature: outside 2.5% CI"
echo ""
mkdir vcfsplit-temporal-directory
cp 2.5_CI_confidence.recode.vcf ./vcfsplit-temporal-directory/
cp 97.5_CI_confidence.recode.vcf ./vcfsplit-temporal-directory/
cd vcfsplit-temporal-directory
# Working with bins outside 2.5% CI
echo "====> Working with bins outside 2.5% CI"
echo ""
grep "#" -v 2.5_CI_confidence.recode.vcf > variants.vcf
grep "#" 2.5_CI_confidence.recode.vcf > vcfheader
awk '{ if ($2>=1 && $2<=265) { print } }' variants.vcf > five_prime_utr.vcfsplit
awk '{ if ($2>=266 && $2<=805) { print } }' variants.vcf > leader_protein.vcfsplit
awk '{ if ($2>=806 && $2<=2719) { print } }' variants.vcf > nsp2.vcfsplit
awk '{ if ($2>=2720 && $2<=8554) { print } }' variants.vcf > nsp3.vcfsplit
awk '{ if ($2>=8555 && $2<=10054) { print } }' variants.vcf > nsp4.vcfsplit
awk '{ if ($2>=10055 && $2<=10972) { print } }' variants.vcf > 3C_like_proteinase.vcfsplit
awk '{ if ($2>=10973 && $2<=11842) { print } }' variants.vcf > nsp6.vcfsplit
awk '{ if ($2>=11843 && $2<=12091) { print } }' variants.vcf > nsp7.vcfsplit
awk '{ if ($2>=12092 && $2<=12685) { print } }' variants.vcf > nsp8.vcfsplit
awk '{ if ($2>=12686 && $2<=13024) { print } }' variants.vcf > nsp9.vcfsplit
awk '{ if ($2>=13025 && $2<=13441) { print } }' variants.vcf > nsp10.vcfsplit
awk '{ if ($2>=13442 && $2<=16236) { print } }' variants.vcf > RNA_dependent_RNA_polymerase.vcfsplit
awk '{ if ($2>=16237 && $2<=18039) { print } }' variants.vcf > helicase.vcfsplit
awk '{ if ($2>=18040 && $2<=19620) { print } }' variants.vcf > 3_prime-to-5_prime_exonuclease.vcfsplit
awk '{ if ($2>=19621 && $2<=20658) { print } }' variants.vcf > endoRNAse.vcfsplit
awk '{ if ($2>=20659 && $2<=21552) { print } }' variants.vcf > 2_prime-O-ribose_methyltransferase.vcfsplit
awk '{ if ($2>=21563 && $2<=25384) { print } }' variants.vcf > S.vcfsplit
awk '{ if ($2>=25393 && $2<=26220) { print } }' variants.vcf > ORF3a.vcfsplit
awk '{ if ($2>=26245 && $2<=26472) { print } }' variants.vcf > E.vcfsplit
awk '{ if ($2>=26523 && $2<=27191) { print } }' variants.vcf > M.vcfsplit
awk '{ if ($2>=27202 && $2<=27387) { print } }' variants.vcf > ORF6.vcfsplit
awk '{ if ($2>=27394 && $2<=27759) { print } }' variants.vcf > ORF7a.vcfsplit
awk '{ if ($2>=27756 && $2<=27887) { print } }' variants.vcf > ORF7b.vcfsplit
awk '{ if ($2>=27894 && $2<=28259) { print } }' variants.vcf > ORF8.vcfsplit
awk '{ if ($2>=28274 && $2<=29533) { print } }' variants.vcf > N.vcfsplit
awk '{ if ($2>=29558 && $2<=29674) { print } }' variants.vcf > ORF10.vcfsplit
awk '{ if ($2>=29675 && $2<=29903) { print } }' variants.vcf > three_prime_utr.vcfsplit
# generating stats
echo "====> Generating stats: outside 2.5% CI"
echo ""
echo "five_prime_utr" >> variants_per_feature.txt
echo "leader_protein" >> variants_per_feature.txt
echo "nsp2" >> variants_per_feature.txt
echo "nsp3" >> variants_per_feature.txt
echo "nsp4" >> variants_per_feature.txt
echo "3C_like_proteinase" >> variants_per_feature.txt
echo "nsp6" >> variants_per_feature.txt
echo "nsp7" >> variants_per_feature.txt
echo "nsp8" >> variants_per_feature.txt
echo "nsp9" >> variants_per_feature.txt
echo "nsp10" >> variants_per_feature.txt
echo "RNA_dependent_RNA_polymerase" >> variants_per_feature.txt
echo "helicase" >> variants_per_feature.txt
echo "3_prime-to-5_prime_exonuclease" >> variants_per_feature.txt
echo "endoRNAse" >> variants_per_feature.txt
echo "2_prime-O-ribose_methyltransferase" >> variants_per_feature.txt
echo "S" >> variants_per_feature.txt
echo "ORF3a" >> variants_per_feature.txt
echo "E" >> variants_per_feature.txt
echo "M" >> variants_per_feature.txt
echo "ORF6" >> variants_per_feature.txt
echo "ORF7a" >> variants_per_feature.txt
echo "ORF7b" >> variants_per_feature.txt
echo "ORF8" >> variants_per_feature.txt
echo "N" >> variants_per_feature.txt
echo "ORF10" >> variants_per_feature.txt
echo "three_prime_utr" >> variants_per_feature.txt
cat five_prime_utr.vcfsplit | wc -l >> variants_per_feature.txt
cat leader_protein.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp2.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp3.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp4.vcfsplit | wc -l >> variants_per_feature.txt
cat 3C_like_proteinase.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp6.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp7.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp8.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp9.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp10.vcfsplit | wc -l >> variants_per_feature.txt
cat RNA_dependent_RNA_polymerase.vcfsplit | wc -l >> variants_per_feature.txt
cat helicase.vcfsplit | wc -l >> variants_per_feature.txt
cat 3_prime-to-5_prime_exonuclease.vcfsplit | wc -l >> variants_per_feature.txt
cat endoRNAse.vcfsplit | wc -l >> variants_per_feature.txt
cat 2_prime-O-ribose_methyltransferase.vcfsplit | wc -l >> variants_per_feature.txt
cat S.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF3a.vcfsplit | wc -l >> variants_per_feature.txt
cat E.vcfsplit | wc -l >> variants_per_feature.txt
cat M.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF6.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF7a.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF7b.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF8.vcfsplit | wc -l >> variants_per_feature.txt
cat N.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF10.vcfsplit | wc -l >> variants_per_feature.txt
cat three_prime_utr.vcfsplit | wc -l >> variants_per_feature.txt
grep '[Aa-Zz]' variants_per_feature.txt > SARS-CoV-2_proteins
grep -v '[Aa-Zz]' variants_per_feature.txt > variants_per_sample
paste SARS-CoV-2_proteins variants_per_sample > variants_per_feature
rm SARS-CoV-2_proteins variants_per_sample variants_per_feature.txt
mv variants_per_feature variants_per_feature_2.5-CI.tabular
# Replacing chromosome name with gene names
echo "====> Replacing chromosome name with gene names : outside 2.5% CI"
echo ""
sed -i 's/NC_045512.2/five_prime_utr/'g five_prime_utr.vcfsplit
sed -i 's/NC_045512.2/leader_protein/'g leader_protein.vcfsplit
sed -i 's/NC_045512.2/nsp2/'g nsp2.vcfsplit
sed -i 's/NC_045512.2/nsp3/'g nsp3.vcfsplit
sed -i 's/NC_045512.2/nsp4/'g nsp4.vcfsplit
sed -i 's/NC_045512.2/3C_like_proteinase/'g 3C_like_proteinase.vcfsplit
sed -i 's/NC_045512.2/nsp6/'g nsp6.vcfsplit
sed -i 's/NC_045512.2/nsp7/'g nsp7.vcfsplit
sed -i 's/NC_045512.2/nsp8/'g nsp8.vcfsplit
sed -i 's/NC_045512.2/nsp9/'g nsp9.vcfsplit
sed -i 's/NC_045512.2/nsp10/'g nsp10.vcfsplit
sed -i 's/NC_045512.2/RNA_dependent_RNA_polymerase/'g RNA_dependent_RNA_polymerase.vcfsplit
sed -i 's/NC_045512.2/helicase/'g helicase.vcfsplit
sed -i 's/NC_045512.2/3_prime-to-5_prime_exonuclease/'g 3_prime-to-5_prime_exonuclease.vcfsplit
sed -i 's/NC_045512.2/endoRNAse/'g endoRNAse.vcfsplit
sed -i 's/NC_045512.2/2_prime-O-ribose_methyltransferase/'g 2_prime-O-ribose_methyltransferase.vcfsplit
sed -i 's/NC_045512.2/S/'g S.vcfsplit
sed -i 's/NC_045512.2/ORF3a/'g ORF3a.vcfsplit
sed -i 's/NC_045512.2/E/'g E.vcfsplit
sed -i 's/NC_045512.2/M/'g M.vcfsplit
sed -i 's/NC_045512.2/ORF6/'g ORF6.vcfsplit
sed -i 's/NC_045512.2/ORF7a/'g ORF7a.vcfsplit
sed -i 's/NC_045512.2/ORF7b/'g ORF7b.vcfsplit
sed -i 's/NC_045512.2/ORF8/'g ORF8.vcfsplit
sed -i 's/NC_045512.2/N/'g N.vcfsplit
sed -i 's/NC_045512.2/ORF10/'g ORF10.vcfsplit
sed -i 's/NC_045512.2/three_prime_utr/'g three_prime_utr.vcfsplit
# tweaking 2.5_CI_confidence.recode.vcf file replacing chromosome with proteins-feature names
echo "====> Tweaking 2.5_CI_confidence.recode.vcf file replacing chromosome with proteins-feature names"
echo ""
cat vcfheader five_prime_utr.vcfsplit leader_protein.vcfsplit nsp2.vcfsplit nsp3.vcfsplit nsp4.vcfsplit 3C_like_proteinase.vcfsplit nsp6.vcfsplit nsp7.vcfsplit nsp8.vcfsplit nsp9.vcfsplit nsp10.vcfsplit RNA_dependent_RNA_polymerase.vcfsplit helicase.vcfsplit 3_prime-to-5_prime_exonuclease.vcfsplit endoRNAse.vcfsplit 2_prime-O-ribose_methyltransferase.vcfsplit S.vcfsplit ORF3a.vcfsplit E.vcfsplit M.vcfsplit ORF6.vcfsplit ORF7a.vcfsplit ORF7b.vcfsplit ORF8.vcfsplit N.vcfsplit ORF10.vcfsplit three_prime_utr.vcfsplit > 2.5_CI_confidence.tweaked.vcf
rm *vcfsplit variants.vcf vcfheader
# Working with bins outside 97.5% CI
echo "====> Working with bins outside 97.5% CI"
echo ""
grep "#" -v 97.5_CI_confidence.recode.vcf > variants.vcf
grep "#" 97.5_CI_confidence.recode.vcf > vcfheader
awk '{ if ($2>=1 && $2<=265) { print } }' variants.vcf > five_prime_utr.vcfsplit
awk '{ if ($2>=266 && $2<=805) { print } }' variants.vcf > leader_protein.vcfsplit
awk '{ if ($2>=806 && $2<=2719) { print } }' variants.vcf > nsp2.vcfsplit
awk '{ if ($2>=2720 && $2<=8554) { print } }' variants.vcf > nsp3.vcfsplit
awk '{ if ($2>=8555 && $2<=10054) { print } }' variants.vcf > nsp4.vcfsplit
awk '{ if ($2>=10055 && $2<=10972) { print } }' variants.vcf > 3C_like_proteinase.vcfsplit
awk '{ if ($2>=10973 && $2<=11842) { print } }' variants.vcf > nsp6.vcfsplit
awk '{ if ($2>=11843 && $2<=12091) { print } }' variants.vcf > nsp7.vcfsplit
awk '{ if ($2>=12092 && $2<=12685) { print } }' variants.vcf > nsp8.vcfsplit
awk '{ if ($2>=12686 && $2<=13024) { print } }' variants.vcf > nsp9.vcfsplit
awk '{ if ($2>=13025 && $2<=13441) { print } }' variants.vcf > nsp10.vcfsplit
awk '{ if ($2>=13442 && $2<=16236) { print } }' variants.vcf > RNA_dependent_RNA_polymerase.vcfsplit
awk '{ if ($2>=16237 && $2<=18039) { print } }' variants.vcf > helicase.vcfsplit
awk '{ if ($2>=18040 && $2<=19620) { print } }' variants.vcf > 3_prime-to-5_prime_exonuclease.vcfsplit
awk '{ if ($2>=19621 && $2<=20658) { print } }' variants.vcf > endoRNAse.vcfsplit
awk '{ if ($2>=20659 && $2<=21552) { print } }' variants.vcf > 2_prime-O-ribose_methyltransferase.vcfsplit
awk '{ if ($2>=21563 && $2<=25384) { print } }' variants.vcf > S.vcfsplit
awk '{ if ($2>=25393 && $2<=26220) { print } }' variants.vcf > ORF3a.vcfsplit
awk '{ if ($2>=26245 && $2<=26472) { print } }' variants.vcf > E.vcfsplit
awk '{ if ($2>=26523 && $2<=27191) { print } }' variants.vcf > M.vcfsplit
awk '{ if ($2>=27202 && $2<=27387) { print } }' variants.vcf > ORF6.vcfsplit
awk '{ if ($2>=27394 && $2<=27759) { print } }' variants.vcf > ORF7a.vcfsplit
awk '{ if ($2>=27756 && $2<=27887) { print } }' variants.vcf > ORF7b.vcfsplit
awk '{ if ($2>=27894 && $2<=28259) { print } }' variants.vcf > ORF8.vcfsplit
awk '{ if ($2>=28274 && $2<=29533) { print } }' variants.vcf > N.vcfsplit
awk '{ if ($2>=29558 && $2<=29674) { print } }' variants.vcf > ORF10.vcfsplit
awk '{ if ($2>=29675 && $2<=29903) { print } }' variants.vcf > three_prime_utr.vcfsplit
# generating stats
echo "====> Generating stats: outside 97.5% CI"
echo ""
echo "five_prime_utr" >> variants_per_feature.txt
echo "leader_protein" >> variants_per_feature.txt
echo "nsp2" >> variants_per_feature.txt
echo "nsp3" >> variants_per_feature.txt
echo "nsp4" >> variants_per_feature.txt
echo "3C_like_proteinase" >> variants_per_feature.txt
echo "nsp6" >> variants_per_feature.txt
echo "nsp7" >> variants_per_feature.txt
echo "nsp8" >> variants_per_feature.txt
echo "nsp9" >> variants_per_feature.txt
echo "nsp10" >> variants_per_feature.txt
echo "RNA_dependent_RNA_polymerase" >> variants_per_feature.txt
echo "helicase" >> variants_per_feature.txt
echo "3_prime-to-5_prime_exonuclease" >> variants_per_feature.txt
echo "endoRNAse" >> variants_per_feature.txt
echo "2_prime-O-ribose_methyltransferase" >> variants_per_feature.txt
echo "S" >> variants_per_feature.txt
echo "ORF3a" >> variants_per_feature.txt
echo "E" >> variants_per_feature.txt
echo "M" >> variants_per_feature.txt
echo "ORF6" >> variants_per_feature.txt
echo "ORF7a" >> variants_per_feature.txt
echo "ORF7b" >> variants_per_feature.txt
echo "ORF8" >> variants_per_feature.txt
echo "N" >> variants_per_feature.txt
echo "ORF10" >> variants_per_feature.txt
echo "three_prime_utr" >> variants_per_feature.txt
cat five_prime_utr.vcfsplit | wc -l >> variants_per_feature.txt
cat leader_protein.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp2.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp3.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp4.vcfsplit | wc -l >> variants_per_feature.txt
cat 3C_like_proteinase.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp6.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp7.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp8.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp9.vcfsplit | wc -l >> variants_per_feature.txt
cat nsp10.vcfsplit | wc -l >> variants_per_feature.txt
cat RNA_dependent_RNA_polymerase.vcfsplit | wc -l >> variants_per_feature.txt
cat helicase.vcfsplit | wc -l >> variants_per_feature.txt
cat 3_prime-to-5_prime_exonuclease.vcfsplit | wc -l >> variants_per_feature.txt
cat endoRNAse.vcfsplit | wc -l >> variants_per_feature.txt
cat 2_prime-O-ribose_methyltransferase.vcfsplit | wc -l >> variants_per_feature.txt
cat S.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF3a.vcfsplit | wc -l >> variants_per_feature.txt
cat E.vcfsplit | wc -l >> variants_per_feature.txt
cat M.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF6.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF7a.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF7b.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF8.vcfsplit | wc -l >> variants_per_feature.txt
cat N.vcfsplit | wc -l >> variants_per_feature.txt
cat ORF10.vcfsplit | wc -l >> variants_per_feature.txt
cat three_prime_utr.vcfsplit | wc -l >> variants_per_feature.txt
grep '[Aa-Zz]' variants_per_feature.txt > SARS-CoV-2_proteins
grep -v '[Aa-Zz]' variants_per_feature.txt > variants_per_sample
paste SARS-CoV-2_proteins variants_per_sample > variants_per_feature
rm SARS-CoV-2_proteins variants_per_sample variants_per_feature.txt
mv variants_per_feature variants_per_feature_97.5-CI.tabular
# Replacing chromosome name with gene names
echo "====> Replacing chromosome name with gene names: outside 97.5% CI"
echo ""
sed -i 's/NC_045512.2/five_prime_utr/'g five_prime_utr.vcfsplit
sed -i 's/NC_045512.2/leader_protein/'g leader_protein.vcfsplit
sed -i 's/NC_045512.2/nsp2/'g nsp2.vcfsplit
sed -i 's/NC_045512.2/nsp3/'g nsp3.vcfsplit
sed -i 's/NC_045512.2/nsp4/'g nsp4.vcfsplit
sed -i 's/NC_045512.2/3C_like_proteinase/'g 3C_like_proteinase.vcfsplit
sed -i 's/NC_045512.2/nsp6/'g nsp6.vcfsplit
sed -i 's/NC_045512.2/nsp7/'g nsp7.vcfsplit
sed -i 's/NC_045512.2/nsp8/'g nsp8.vcfsplit
sed -i 's/NC_045512.2/nsp9/'g nsp9.vcfsplit
sed -i 's/NC_045512.2/nsp10/'g nsp10.vcfsplit
sed -i 's/NC_045512.2/RNA_dependent_RNA_polymerase/'g RNA_dependent_RNA_polymerase.vcfsplit
sed -i 's/NC_045512.2/helicase/'g helicase.vcfsplit
sed -i 's/NC_045512.2/3_prime-to-5_prime_exonuclease/'g 3_prime-to-5_prime_exonuclease.vcfsplit
sed -i 's/NC_045512.2/endoRNAse/'g endoRNAse.vcfsplit
sed -i 's/NC_045512.2/2_prime-O-ribose_methyltransferase/'g 2_prime-O-ribose_methyltransferase.vcfsplit
sed -i 's/NC_045512.2/S/'g S.vcfsplit
sed -i 's/NC_045512.2/ORF3a/'g ORF3a.vcfsplit
sed -i 's/NC_045512.2/E/'g E.vcfsplit
sed -i 's/NC_045512.2/M/'g M.vcfsplit
sed -i 's/NC_045512.2/ORF6/'g ORF6.vcfsplit
sed -i 's/NC_045512.2/ORF7a/'g ORF7a.vcfsplit
sed -i 's/NC_045512.2/ORF7b/'g ORF7b.vcfsplit
sed -i 's/NC_045512.2/ORF8/'g ORF8.vcfsplit
sed -i 's/NC_045512.2/N/'g N.vcfsplit
sed -i 's/NC_045512.2/ORF10/'g ORF10.vcfsplit
sed -i 's/NC_045512.2/three_prime_utr/'g three_prime_utr.vcfsplit
# tweaking 97.5_CI_confidence.recode.vcf file replacing chromosome with proteins-feature names
echo "====> Tweaking 97.5_CI_confidence.recode.vcf file replacing chromosome with proteins-feature names"
echo ""
cat vcfheader five_prime_utr.vcfsplit leader_protein.vcfsplit nsp2.vcfsplit nsp3.vcfsplit nsp4.vcfsplit 3C_like_proteinase.vcfsplit nsp6.vcfsplit nsp7.vcfsplit nsp8.vcfsplit nsp9.vcfsplit nsp10.vcfsplit RNA_dependent_RNA_polymerase.vcfsplit helicase.vcfsplit 3_prime-to-5_prime_exonuclease.vcfsplit endoRNAse.vcfsplit 2_prime-O-ribose_methyltransferase.vcfsplit S.vcfsplit ORF3a.vcfsplit E.vcfsplit M.vcfsplit ORF6.vcfsplit ORF7a.vcfsplit ORF7b.vcfsplit ORF8.vcfsplit N.vcfsplit ORF10.vcfsplit three_prime_utr.vcfsplit > 97.5_CI_confidence.tweaked.vcf
rm *vcfsplit variants.vcf vcfheader
cd ..
cp ./vcfsplit-temporal-directory/2.5_CI_confidence.tweaked.vcf ./
cp ./vcfsplit-temporal-directory/97.5_CI_confidence.tweaked.vcf ./
cp ./vcfsplit-temporal-directory/variants_per_feature_2.5-CI.tabular ./
cp ./vcfsplit-temporal-directory/variants_per_feature_97.5-CI.tabular ./
rm -rf vcfsplit-temporal-directory
# invoking vcfstats on tweaked vcf files
echo "====> Invoking vcfstats on tweaked vcf files"
echo ""
echo "====> Working in postprocessing_pi_D_output_files"
echo ""
mkdir postprocessing_pi_D_output_files
# cleaning up
echo ""
echo "====> Cleaning up .."
echo ""
gzip 2.5_CI_confidence.tweaked.vcf 2.5_CI_confidence.recode.vcf 97.5_CI_confidence.tweaked.vcf 97.5_CI_confidence.recode.vcf
mv 2.5_CI_confidence.tweaked.vcf.gz 2.5_CI_confidence.recode.vcf.gz 97.5_CI_confidence.tweaked.vcf.gz 97.5_CI_confidence.recode.vcf.gz variants_per_feature_2.5-CI.tabular variants_per_feature_97.5-CI.tabular bins_2.5%_confidence.bed bins_97.5%_confidence.bed pi_tajima.pdf ./postprocessing_pi_D_output_files/
echo "##################"
echo "##################"
echo ""
echo "All done. Check postprocessing_pi_D_output_files folder containing output files"
echo ""
echo "bins_2.5%_confidence.bed contains bins with Tajima's D values out of 2.5% CI. Check pi_tajima.pdf plot"
echo ""
echo "bins_97.5%_confidence.bed contains bins with Tajima's D values out of 97.5% CI. Check pi_tajima.pdf plot"
echo ""
echo "2.5_CI_confidence.recode.vcf.gz contain sites in Tajima's D values out of 2.5% CI"
echo ""
echo "97.5_CI_confidence.recode.vcf.gz contain sites in Tajima's D values out of 97.5% CI"
echo ""
echo "A tweaked version of the latter files conatining protein names instead of chromosome name are called 2.5 and 97.5_CI_confidence.tweaked.vcf.gz, respectively"
echo ""
echo "variants_per_feature_2.5-CI.tabular contain variants falling in bins with Tajima's D values out of 2.5% CI"
echo ""
echo "variants_per_feature_97.5-CI.tabular contain variants falling in bins with Tajima's D values out of 97.5% CI"
echo ""
echo "##################"
echo "##################"
#
} | tee logfile_tajima_d_postprocessing
#