-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathL2_analysis.sh
210 lines (149 loc) · 8.44 KB
/
L2_analysis.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# It contains analysis of L2 elements in the tuatara genome, four parts included in this script:
# 2a) L2 classification
# 2b) L2 phylogeny
# 2c) Analysis of L2 based on the RT domain
# 2d) L2 divergence rate
#------------------------------------------#
#### 2a) L2 classification
#------------------------------------------#
# Extract full-length L2 elements (2-4kb)
perl extract_full_length.pl tua_V3.lib.fa.L2
perl extract_full_length.pl rmk_tuaV3.fa.L2
# Since CARP doesn't take the strand direction into consideration,
# we orrected the direction of consensus sequences before building a tree
# grep the title of full-length L2 families
grep '>' full-length.tua_V3.lib.fa.L2 > title.full-length.tua_V3.lib.fa.L2
awk -F ':' '{print $1}' title.full-length.tua_V3.lib.fa.L2 > title.full-length.tua_V3.lib.fa.L2.use
sed 's/>//g' title.full-length.tua_V3.lib.fa.L2.use > title.full-length.tua_V3.lib.fa.L2.use2
# Extract strand directions from CENSOR output
./grep_censor.sh title.full-length.tua_V3.lib.fa.L2.use2 > strand.title.full-length.tua_V3.lib.fa.L2
# Extract complement families and their corresponding consensus sequences
perl divide-+.pl strand.title.full-length.tua_V3.lib.fa.L2
perl extract_fasta_seq.pl strand.title.full-length.tua_V3.lib.fa.L2_comple full-length.tua_V3.lib.fa.L2 > comple.full-length.tua_V3.lib.fa.L2
perl extract_fasta_seq.pl strand.title.full-length.tua_V3.lib.fa.L2_forward full-length.tua_V3.lib.fa.L2 > forward.full-length.tua_V3.lib.fa.L2
# Reverse the complement sequences
perl reverse_complement.pl comple.full-length.tua_V3.lib.fa.L2 > reverse.full-length.tua_V3.lib.fa.L2
# Merge all full-length L2 consensus sequences
cat reverse.full-length.tua_V3.lib.fa.L2 forward.full-length.tua_V3.lib.fa.L2 full-length.rmk_tuaV3.fa.L2 > tua_L2_tree.fa
# Run muscle alignment
muscle -in tua_L2_tree.fa -out tua_L2_tree.afa -maxiters 2
# Use gblocks to remove the bad alignment
Gblocks tua_L2_tree.afa -t=d -p=n -e=.gb -b5=h
tr -d " \t" < tua_L2_tree.afa.gb > tua_L2_tree.afa.gbHalf.afa
# Build tree
fasttree -nt < tua_L2_tree.afa.gbHalf.afa > gblock_tua_L2_tree
#------------------------------------------#
#### 2b) L2 phylogeny
#------------------------------------------#
# Download vertebrates L2 consensus sequences from Repbase, and extract the full-length ones
perl space_to_star.pl Repbase.L2.fa > Repbase.L2.fa.use
perl extract_full_length.pl Repbase.L2.fa.use
# Since CARP doesn't take the strand direction into consideration, we corrected the direction of consensus sequences before building a tree,
# Also, we have run CARP on other five species, which their full-length L2 elements are included in this analysis
# First, extract the L2 family name from species tested by CARP, including chicken, platypus, opossum, bearded dragon, anole and tuatara
# $1 represents species's name
# grep the title of full-length L2 families
grep '>' full-length.$1.fa.L2 > title.full-length.$1.fa.L2
awk -F ':' '{print $1}' title.full-length.$1.fa.L2 > title.full-length.$1.fa.L2.use
sed 's/>//g' title.full-length.$1.fa.L2.use > title.full-length.$1.fa.L2.use2
# Extract strand directions from CENSOR output
./grep_censor.sh title.full-length.$1.fa.L2.use2 > strand.full-length.$1.fa.L2
# Extract complement families and their corresponding consensus sequences
perl divide-+.pl strand.full-length.$1.fa.L2
perl extract_fasta_seq.pl strand.full-length.$1.fa.L2_comple full-length.$1.fa.L2 > comple.full-length.$1.fa.L2
perl extract_fasta_seq.pl strand.full-length.$1.fa.L2_forward full-length.$1.fa.L2 > forward.full-length.$1.fa.L2
# Reverse the complement sequences
perl reverse_complement.pl comple.full-length.$1.fa.L2 > reverse.full-length.$1.fa.L2
# rename five species consensus sequences, e.g.
sed -i 's/consensus/tuatara/g' forward.full-length.tuatara.fa.L2 > forward.full-length.tuatara.fa.L2
sed -i 's/consensus/tuatara/g' reverse.full-length.tuatara.fa.L2 > reverse.full-length.tuatara.fa.L2
# Merge all full-length L2 consensus sequences
cat reverse.* forward.* full-length.rmk_tuaV3.fa.L2 full-length.Repbase.L2.fa.use > L2_tree.fasta
# Run muscle alignment
muscle -in L2_tree.fasta -out L2_tree.afa -maxiters 2
# Use gblocks to remove the bad alignment
Gblocks L2_tree.afa -t=d -p=n -e=.gb -b5=h
tr -d " \t" < L2_tree.afa.gb > L2_tree.afa.gbHalf.afa
# Build tree
fasttree -nt < L2_tree.afa.gbHalf.afa > gblock_L2_tree
#------------------------------------------#
#### 2c) Analysis of L2 based on the RT domain
#------------------------------------------#
# RepeatMaskererge all full-length L2 from six species we generated by using CARP
cat reverse.* forward.* > combined.full-length.L2.fa
# Identify ORF2p sequences (>500aa)
usearch -fastx_findorfs combined.full-length.L2.fa -ntout combined.full-length.L2.fa.out -aaout combined.full-length.L2.fa.translated -orfstyle 7 -mincodons 500
# Run muscle alignment
muscle -in combined.full-length.L2.fa.translated -out combined_L2_RT_domain.afa -maxiters 2
# Use gblocks to remove the bad alignment
Gblocks combined_L2_RT_domain.afa -t=d -p=n -e=.gb -b5=h
tr -d " \t" < combined_L2_RT_domain.afa.gb > combined_L2_RT_domain.afa.gbHalf.afa
# Build tree
fasttree -nt < combined_L2_RT_domain.afa.gbHalf.afa > gblock_L2_RT_domain_tree
#------------------------------------------#
#### 2d) Possible horizontal transfer between tuatara and platypus
#------------------------------------------#
# Separate tuatara L2 into platypus-like (HT)/non-platypus-like (non-HT) based on the tree build from step 1, which results to 22 HT L2 and 33 non-HT L2.
# run CENSOR against five reptile genomes (anole, crocodile, alligator, turtle and bearded dragon) and one monotreme genome (platypus)
# $1 represents the specie name
cd censor_ht
censor -bprm cpus=16 -lib ht.L2.fa $1.genome.fa
cd censor_nonht
censor -bprm cpus=16 -lib nonht.L2.fa $1.genome.fa
# Extract nucleotide sequences from CENSOR hits respectively
# platypus-like L2
for i in *.map;
do
echo $i
awk '{if(($3-$2+1)>50) print $0}' $i > $i.use
awk '{$1 "\t" $2 "\t" $3}' $i.use > L2.ht.$i.bed
done
fastaFromBed -fi ../$1.genome.fa -bed L2.ht.$1.bed -fo L2.ht.$1.fa
# non-platypus-like L2
for i in *.map;
do
echo $i
awk '{if(($3-$2+1)>50) print $0}' $i > $i.use
awk '{$1 "\t" $2 "\t" $3}' $i.use > L2.nonht.$i.bed
done
fastaFromBed -fi ../$1.genome.fa -bed L2.nonht.$1.bed -fo L2.nonht.$1.fa
# Merge hits from same species
cat L2.ht.$1.fa L2.nonht.$1.fa > L2.ht-nonht.$1.fa
# Run BLASTN against the platypus-like tuatara L2 and non-platypus like tuatara L2 consensus sequences
# Hits smaller than 50bp were discarded
cat ht.L2.fa nonht.L2.fa > L2.cons.fa
blastn -subject L2.ht-nonht.$1.fa -query L2.cons.fa -outfmt 6 -out blastn.L2.ht-nonht.$1
# Remove the short coordinates from BLASTN outputs
for i in blastn.*;
do
echo $i
awk '{if(($4)>50) print}' $i > large_$i
sort -k2,2 -k11,11n large_$i | sort -u -k2,2 --merge > best.$i
grep 'nonht' best.$i > nonht.best.$i
sed '/nonht/d' best.$i > ht.best.$i
done
#------------------------------------------#
#### 2e) L2 divergence rate
#------------------------------------------#
# Calculate divergence rate of platypus-like L2 tuatara elements and non-platypus-like L2 tuatara elements
RepeatMasker -pa 16 -a -nolow -lib L2.cons.fa tuatara_30Sep2015_rUdWx.fasta
# Get the length of full-length tuatara L2 consensus sequences
awk '$0 ~ ">" {if (NR > 1) {print c;} c=0;printf substr($0,2,100) "\t"; } $0 !~ ">" {c+=length($0);} END { print c; }' L2.cons.fa > length_L2.cons.fa
# Extract kimura substitution rate
grep '^Kimura' tuatara_30Sep2015_rUdWx.fasta.align > kimura.rate.txt
# Extract genome coordinates from align output
grep '^[0-9]' tuatara_30Sep2015_rUdWx.fasta.align > align.coord.txt
awk '{if($9=="C") print $5"\t"$10"\t"$12"\t"$13; else print $5"\t"$9"\t"$10"\t"$11}' align.coord.txt > tmp.align.coord.txt
sed -i 's/#Unspecified//g' tmp.align.coord.txt
# Make the start coordinate is smaller than the stop
awk '{if($3<$4) len=$4-$3+1; else len=$3-$4+1}{print $0"\t"len}' tmp.align.coord.txt > use.align.coord.txt
paste use.align.coord.txt kimura.rate.txt > align.coordi.kimura.txt
# Add L2 family length to the file
perl merge_same_fam.pl length_L2.cons.fa align.coordi.kimura.txt > tmp
# Remove not wanted string
sed -i 's/Kimura (with divCpGMod) = //g' tmp
# Extract the length of alignments is >90% of the full-length L2
awk '{if(($5/$8)>=0.9) print}' tmp > long.align.coordi.kimura.txt
sed -i 's/ht_.*/ht/g; s/nonht_.*/nonht/g' long.align.coordi.kimura.txt
awk '{print $6}' tmp > tmp2
paste long.align.coordi.kimura.txt tmp2 > hc.kimura.rate.txt