L2_analysis/L2_analysis.sh

# It contains analysis of L2 elements in the tuatara genome, four parts included in this script:
# 2a) L2 classification
# 2b) L2 phylogeny
# 2c) Analysis of L2 based on the RT domain
# 2d) L2 divergence rate


#------------------------------------------#
#### 2a) L2 classification
#------------------------------------------#

# Extract full-length L2 elements (2-4kb)
perl extract_full_length.pl tua_V3.lib.fa.L2
perl extract_full_length.pl rmk_tuaV3.fa.L2

# Since CARP doesn't take the strand direction into consideration, 
# we orrected the direction of consensus sequences before building a tree

# grep the title of full-length L2 families
grep '>' full-length.tua_V3.lib.fa.L2 > title.full-length.tua_V3.lib.fa.L2
awk -F ':' '{print $1}' title.full-length.tua_V3.lib.fa.L2 > title.full-length.tua_V3.lib.fa.L2.use
sed 's/>//g' title.full-length.tua_V3.lib.fa.L2.use > title.full-length.tua_V3.lib.fa.L2.use2

# Extract strand directions from CENSOR output
./grep_censor.sh title.full-length.tua_V3.lib.fa.L2.use2 > strand.title.full-length.tua_V3.lib.fa.L2

# Extract complement families and their corresponding consensus sequences
perl divide-+.pl strand.title.full-length.tua_V3.lib.fa.L2
perl extract_fasta_seq.pl strand.title.full-length.tua_V3.lib.fa.L2_comple full-length.tua_V3.lib.fa.L2 > comple.full-length.tua_V3.lib.fa.L2
perl extract_fasta_seq.pl strand.title.full-length.tua_V3.lib.fa.L2_forward full-length.tua_V3.lib.fa.L2 > forward.full-length.tua_V3.lib.fa.L2
 
# Reverse the complement sequences
perl reverse_complement.pl comple.full-length.tua_V3.lib.fa.L2 > reverse.full-length.tua_V3.lib.fa.L2

# Merge all full-length L2 consensus sequences
cat reverse.full-length.tua_V3.lib.fa.L2 forward.full-length.tua_V3.lib.fa.L2 full-length.rmk_tuaV3.fa.L2 > tua_L2_tree.fa

# Run muscle alignment
muscle -in tua_L2_tree.fa -out tua_L2_tree.afa -maxiters 2

# Use gblocks to remove the bad alignment
Gblocks tua_L2_tree.afa -t=d -p=n -e=.gb -b5=h
tr -d " \t" < tua_L2_tree.afa.gb > tua_L2_tree.afa.gbHalf.afa

# Build tree
fasttree -nt < tua_L2_tree.afa.gbHalf.afa > gblock_tua_L2_tree


#------------------------------------------#
#### 2b) L2 phylogeny
#------------------------------------------#

# Download vertebrates L2 consensus sequences from Repbase, and extract the full-length ones
perl space_to_star.pl Repbase.L2.fa > Repbase.L2.fa.use
perl extract_full_length.pl Repbase.L2.fa.use

# Since CARP doesn't take the strand direction into consideration, we corrected the direction of consensus sequences before building a tree, 
# Also, we have run CARP on other five species, which their full-length L2 elements are included in this analysis

# First, extract the L2 family name from species tested by CARP, including chicken, platypus, opossum, bearded dragon, anole and tuatara
# $1 represents species's name

# grep the title of full-length L2 families
grep '>' full-length.$1.fa.L2 > title.full-length.$1.fa.L2
awk -F ':' '{print $1}' title.full-length.$1.fa.L2 > title.full-length.$1.fa.L2.use
sed 's/>//g' title.full-length.$1.fa.L2.use > title.full-length.$1.fa.L2.use2

# Extract strand directions from CENSOR output
./grep_censor.sh title.full-length.$1.fa.L2.use2 > strand.full-length.$1.fa.L2

# Extract complement families and their corresponding consensus sequences
perl divide-+.pl strand.full-length.$1.fa.L2
perl extract_fasta_seq.pl strand.full-length.$1.fa.L2_comple full-length.$1.fa.L2 > comple.full-length.$1.fa.L2
perl extract_fasta_seq.pl strand.full-length.$1.fa.L2_forward full-length.$1.fa.L2 > forward.full-length.$1.fa.L2
 
# Reverse the complement sequences
perl reverse_complement.pl comple.full-length.$1.fa.L2 > reverse.full-length.$1.fa.L2

# rename five species consensus sequences, e.g.
sed -i 's/consensus/tuatara/g' forward.full-length.tuatara.fa.L2 > forward.full-length.tuatara.fa.L2
sed -i 's/consensus/tuatara/g' reverse.full-length.tuatara.fa.L2 > reverse.full-length.tuatara.fa.L2

# Merge all full-length L2 consensus sequences
cat reverse.* forward.* full-length.rmk_tuaV3.fa.L2 full-length.Repbase.L2.fa.use > L2_tree.fasta

# Run muscle alignment
muscle -in L2_tree.fasta -out L2_tree.afa -maxiters 2

# Use gblocks to remove the bad alignment
Gblocks L2_tree.afa -t=d -p=n -e=.gb -b5=h
tr -d " \t" < L2_tree.afa.gb > L2_tree.afa.gbHalf.afa

# Build tree
fasttree -nt < L2_tree.afa.gbHalf.afa > gblock_L2_tree


#------------------------------------------#
#### 2c) Analysis of L2 based on the RT domain
#------------------------------------------#

# RepeatMaskererge all full-length L2 from six species we generated by using CARP
cat reverse.* forward.* > combined.full-length.L2.fa

# Identify ORF2p sequences (>500aa)
usearch -fastx_findorfs combined.full-length.L2.fa -ntout combined.full-length.L2.fa.out -aaout combined.full-length.L2.fa.translated -orfstyle 7 -mincodons 500

# Run muscle alignment
muscle -in combined.full-length.L2.fa.translated -out combined_L2_RT_domain.afa -maxiters 2

# Use gblocks to remove the bad alignment
Gblocks combined_L2_RT_domain.afa -t=d -p=n -e=.gb -b5=h
tr -d " \t" < combined_L2_RT_domain.afa.gb > combined_L2_RT_domain.afa.gbHalf.afa

# Build tree
fasttree -nt < combined_L2_RT_domain.afa.gbHalf.afa > gblock_L2_RT_domain_tree


#------------------------------------------#
#### 2d) Possible horizontal transfer between tuatara and platypus
#------------------------------------------#

# Separate tuatara L2 into platypus-like (HT)/non-platypus-like (non-HT) based on the tree build from step 1, which results to 22 HT L2 and 33 non-HT L2.

# run CENSOR against five reptile genomes (anole, crocodile, alligator, turtle and bearded dragon) and one monotreme genome (platypus)
# $1 represents the specie name
cd censor_ht
censor -bprm cpus=16 -lib ht.L2.fa $1.genome.fa

cd censor_nonht
censor -bprm cpus=16 -lib nonht.L2.fa $1.genome.fa

# Extract nucleotide sequences from CENSOR hits respectively
# platypus-like L2 
for i in *.map;
do
    echo $i
    awk '{if(($3-$2+1)>50) print $0}' $i > $i.use
    awk '{$1 "\t" $2 "\t" $3}' $i.use > L2.ht.$i.bed
done

fastaFromBed -fi ../$1.genome.fa -bed L2.ht.$1.bed -fo L2.ht.$1.fa

# non-platypus-like L2
for i in *.map;
do
    echo $i
    awk '{if(($3-$2+1)>50) print $0}' $i > $i.use
    awk '{$1 "\t" $2 "\t" $3}' $i.use > L2.nonht.$i.bed
done

fastaFromBed -fi ../$1.genome.fa -bed L2.nonht.$1.bed -fo L2.nonht.$1.fa

# Merge hits from same species
cat L2.ht.$1.fa L2.nonht.$1.fa > L2.ht-nonht.$1.fa

# Run BLASTN against the platypus-like tuatara L2 and non-platypus like tuatara L2 consensus sequences
# Hits smaller than 50bp were discarded
cat ht.L2.fa nonht.L2.fa > L2.cons.fa

blastn -subject L2.ht-nonht.$1.fa -query L2.cons.fa -outfmt 6 -out blastn.L2.ht-nonht.$1

# Remove the short coordinates from BLASTN outputs
for i in blastn.*; 
do
   echo $i
   awk '{if(($4)>50) print}' $i > large_$i
   sort -k2,2  -k11,11n  large_$i | sort -u -k2,2 --merge > best.$i
   grep 'nonht' best.$i > nonht.best.$i
   sed '/nonht/d' best.$i > ht.best.$i
done


#------------------------------------------#
#### 2e) L2 divergence rate
#------------------------------------------#

# Calculate divergence rate of platypus-like L2 tuatara elements and non-platypus-like L2 tuatara elements
RepeatMasker -pa 16 -a -nolow -lib L2.cons.fa tuatara_30Sep2015_rUdWx.fasta

# Get the length of full-length tuatara L2 consensus sequences
awk '$0 ~ ">" {if (NR > 1) {print c;} c=0;printf substr($0,2,100) "\t"; } $0 !~ ">" {c+=length($0);} END { print c; }' L2.cons.fa > length_L2.cons.fa

# Extract kimura substitution rate
grep '^Kimura' tuatara_30Sep2015_rUdWx.fasta.align > kimura.rate.txt

# Extract genome coordinates from align output
grep '^[0-9]' tuatara_30Sep2015_rUdWx.fasta.align > align.coord.txt
awk '{if($9=="C") print $5"\t"$10"\t"$12"\t"$13; else print $5"\t"$9"\t"$10"\t"$11}' align.coord.txt > tmp.align.coord.txt
sed -i 's/#Unspecified//g' tmp.align.coord.txt

# Make the start coordinate is smaller than the stop
awk '{if($3<$4) len=$4-$3+1; else len=$3-$4+1}{print $0"\t"len}' tmp.align.coord.txt > use.align.coord.txt
paste use.align.coord.txt kimura.rate.txt > align.coordi.kimura.txt

# Add L2 family length to the file
perl merge_same_fam.pl length_L2.cons.fa align.coordi.kimura.txt > tmp

# Remove not wanted string
sed -i 's/Kimura (with divCpGMod) = //g' tmp

# Extract the length of alignments is >90% of the full-length L2
awk '{if(($5/$8)>=0.9) print}' tmp > long.align.coordi.kimura.txt
sed -i 's/ht_.*/ht/g; s/nonht_.*/nonht/g' long.align.coordi.kimura.txt
awk '{print $6}' tmp > tmp2 
paste long.align.coordi.kimura.txt tmp2  > hc.kimura.rate.txt