-
Notifications
You must be signed in to change notification settings - Fork 0
/
AnnoGenerate.py
119 lines (104 loc) · 7.91 KB
/
AnnoGenerate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import sys
import subprocess
import HTSeq, time, argparse, re
from Bio.Emboss.Applications import NeedleCommandline
from Bio import AlignIO
from pybedtools import BedTool
import pandas as pd
parser = argparse.ArgumentParser(
description = 'This script takes in an annotation file of human Alu and\
MIR elements and the corresponding human reference genome sequence\
to produce the index for SINEs_find.py. The index file is the annotation \
input file with added columns reporting the lenght, start and end of \
the alignment of the element sequence to its consensus sequence \
on the reference genome',
epilog = 'Written by Davide Carnevali davide.carnevali@crg.eu')
parser.add_argument("annotation", help="Annotation file in GTF format \
(either gzipped or not). Should refer to the same version \
of the human reference genome file")
parser.add_argument("genome", help="Human reference genome sequence. Should refer \
to the same version of the annotation file")
parser.add_argument("output", help="output filename")
args = parser.parse_args()
genome = BedTool(args.genome)
MIR = 'ACAGTATAGCATAGTGGTTAAGAGCACGGACTCTGGAGCCAGACTGCCTGGGTTCGAATCCCGGCTCTGCCACTTACTAGCTGTGTGACCTTGGGCAAGTTACTTAACCTCTCTGTGCCTCAGTTTCCTCATCTGTAAAATGGGGATAATAATAGTACCTACCTCATAGGGTTGTTGTGAGGATTAAATGAGTTAATACATGTAAAGCGCTTAGAACAGTGCCTGGCACATAGTAAGCGCTCAATAAATGTTGGTTATTA'
MIRb = 'CAGAGGGGCAGCGTGGTGCAGTGGAAAGAGCACGGGCTTTGGAGTCAGGCAGACCTGGGTTCGAATCCTGGCTCTGCCACTTACTAGCTGTGTGACCTTGGGCAAGTCACTTAACCTCTCTGAGCCTCAGTTTCCTCATCTGTAAAATGGGGATAATAATACCTACCTCGCAGGGTTGTTGTGAGGATTAAATGAGATAATGCATGTAAAGCGCTTAGCACAGTGCCTGGCACACAGTAAGCGCTCAATAAATGGTAGCTCTATTATT'
MIRc = 'CGAGGCAGTGTGGTGCAGTGGAAAGAGCACTGGACTTGGAGTCAGGAAGACCTGGGTTCGAGTCCTGGCTCTGCCACTTACTAGCTGTGTGACCTTGGGCAAGTCACTTAACCTCTCTGAGCCTCAGTTTCCTCATCTGTAAAATGGGGATAATAATACCTGCCCTGCCTACCTCACAGGGTTGTTGTGAGGATCAAATGAGATAATGTATGTGAAAGCGCTTTGTAAACTGTAAAGTGCTATACAAATGTAAGGGGTTATTATTATT'
MIR3 = 'TTCTGGAAGCAGTATGGTATAGTGGAAAGAACAACTGGACTAGGAGTCAGGAGACCTGGGTTCTAGTCCTAGCTCTGCCACTAACTAGCTGTGTGACCTTGGGCAAGTCACTTCACCTCTCTGGGCCTCAGTTTTCCTCATCTGTAAAATGAGNGGGTTGGACTAGATGATCTCTAAGGTCCCTTCCAGCTCTAACATTCTATGATTCTATGATTCTAAAAAAA'
MIR1_Amn = 'TAGGGAGGCAGTGTGGTCTAGTGGATAGAGCACTGGACTGGGACTCGGGAGACCTGGGTTCNANTCCCGGCTCTGCCACTNGCCNGCTGNGTGACCTTGGGCAAGTCACTTNACCTCTCTGNGCCTCAGTTTCCTCATCTGTAAAATGGGGATAATGATACTGACCTCCTTTGTAAAGTGCTTTGAGATCTACTGATGAAAAGTGCTACATAAGAGCTAGGTATTATTAT'
ALU = 'GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGAGGATTGCTTGAGCCCAGGAGTTCGAGACCAGCCTGGGCAACATAGCGAGACCCCGTCTCTACAAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCAGGAGGATCGCTTGAGCCCAGGAGTTCGAGGCTGCAGTGAGCTATGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACCCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
### TO DO: FAM/FLAM/FRAM IMPLEMENTATION ###
#FAM = 'GCCGGGCGCGGTGGCGCGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCGGGAGGATCGCTTGAGCCCAGGAGTTCGAGGCTGTAGTGCGCTATGATCGCGCCTGTGAATAGCCACTGCACTCCAGCCTGAGCAACATAGCGAGACCCCGTCTCTTAAAAAAAAA'
#FLAM = 'GCCGGGCGCGGTGGCGCGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCGGGAGGATCGCTTGAGCCCAGGAGTTCGAGACCAGCCTGGGCAACATAGCGAGACCCCGTCTCTAAAAAAAA'
#FRAM = 'GCCGGGCGCGGTGGCGCGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCGGGAGGATCGCTTGAGCCCAGGAGTTCGAGGCTGCAGTGAGCTATGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACCCCGTCTCAAAAAAAAAA'
start_time = time.time()
annotation = HTSeq.GFF_Reader(args.annotation)
alu_list = []
char = re.compile('-*')
char2 = re.compile('[-NATGCatgc]*')
def needle(chrom, start, end, name, score, strand):
tmpfile = open("tmpaln.txt", 'w')
item=BedTool([(chrom, start, end, name, score, strand)])
item = item.sequence(fi=genome, s=True)
temp = open(item.seqfn).read().split('\n')[1]
if name == "MIRb":
sine_length = 269
needle_cline = NeedleCommandline(asequence="asis:"+MIRb, bsequence="asis:"+temp,gapopen=10, gapextend=0.5, outfile='stdout')
child = subprocess.Popen(str(needle_cline), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=(sys.platform!="win32"))
child.wait()
align = AlignIO.read(tmpfile.name, "emboss")
aln_start = char.search(str(align[1, :].seq)).end()
aln_end = char2.search(str(align[1, :].seq)).end()
elif name == "MIRc":
sine_length = 269
needle_cline = NeedleCommandline(asequence="asis:"+MIRc, bsequence="asis:"+temp,gapopen=10, gapextend=0.5, outfile='stdout')
child = subprocess.Popen(str(needle_cline), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=(sys.platform!="win32"))
child.wait()
align = AlignIO.read(tmpfile.name, "emboss")
aln_start = char.search(str(align[1, :].seq)).end()
aln_end = char2.search(str(align[1, :].seq)).end()
elif name == "MIR3":
sine_length = 225
needle_cline = NeedleCommandline(asequence="asis:"+MIR3, bsequence="asis:"+temp,gapopen=10, gapextend=0.5, outfile='stdout')
child = subprocess.Popen(str(needle_cline), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=(sys.platform!="win32"))
child.wait()
align = AlignIO.read(tmpfile.name, "emboss")
aln_start = char.search(str(align[1, :].seq)).end()
aln_end = char2.search(str(align[1, :].seq)).end()
elif name == "MIR1_Amn":
sine_length = 231
needle_cline = NeedleCommandline(asequence="asis:"+MIR3, bsequence="asis:"+temp,gapopen=10, gapextend=0.5, outfile='stdout')
child = subprocess.Popen(str(needle_cline), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=(sys.platform!="win32"))
child.wait()
align = AlignIO.read(tmpfile.name, "emboss")
aln_start = char.search(str(align[1, :].seq)).end()
aln_end = char2.search(str(align[1, :].seq)).end()
elif name == "MIR":
sine_length = 261
needle_cline = NeedleCommandline(asequence="asis:"+MIR, bsequence="asis:"+temp,gapopen=10, gapextend=0.5, outfile='stdout')
child = subprocess.Popen(str(needle_cline), stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=(sys.platform!="win32"))
child.wait()
align = AlignIO.read(tmpfile.name, "emboss")
aln_start = char.search(str(align[1, :].seq)).end()
aln_end = char2.search(str(align[1, :].seq)).end()
elif "Alu" in name:
sine_length = 313
needle_cline = NeedleCommandline(asequence="asis:"+ALU, bsequence="asis:"+temp,gapopen=10, gapextend=0.5, outfile='stdout')
child = subprocess.Popen(str(needle_cline), stdout=tmpfile, stderr=subprocess.PIPE, shell=(sys.platform!="win32"))
child.wait()
align = AlignIO.read(tmpfile.name, "emboss")
aln_start = char.search(str(align[1, :].seq)).end()
aln_end = char2.search(str(align[1, :].seq)).end()
tmpfile.close()
return (aln_start, aln_end, sine_length)
# To mantain GTF notation, which is 1 based, we add +1 to element.iv.start
for element in annotation:
aln_start, aln_end, sine_length = needle(element.iv.chrom, element.iv.start, element.iv.end, element.attr['gene_id'], int(element.score), element.iv.strand)
if element.iv.strand == "+":
alu_list.append([element.iv.chrom, element.source, "exon", element.iv.start + 1, element.iv.end, "0", element.iv.strand, ".", "gene_id " + "\""+element.attr['transcript_id']+"\"; " + "transcript_id " + "\""+element.attr['transcript_id']+"\";",element.iv.start - aln_start, (element.iv.start - aln_start) + aln_end])
else:
alu_list.append([element.iv.chrom, element.source, "exon", element.iv.start + 1, element.iv.end, "0", element.iv.strand, ".", "gene_id " + "\""+element.attr['transcript_id']+"\"; " + "transcript_id " + "\""+element.attr['transcript_id']+"\";",(element.iv.end + aln_start) - aln_end, element.iv.end + aln_start])
final_list = pd.DataFrame(alu_list)
final_list.to_csv(args.output,sep="\t", header=False,index=False,doublequote=False,quotechar='\'',escapechar='')
print("Finished!")
print("Time elapsed {}".format(time.time() - start_time))