forked from Czh3/NGSTools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ann_GeneOntology.py
executable file
·115 lines (83 loc) · 2.17 KB
/
ann_GeneOntology.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
import sys
import os
# add gene ontology anntation tab divided files.
# zhangchao3@hotmail.com
def usage():
print '\nUSAGE <for cuffdiff output>:\npython %s infile outfile' % sys.argv[0]
exit()
def safeOpen(file, mode='r'):
if file.endswith('.gz'):
import gzip
return gzip.open(file, mode)
else:
return open(file, mode)
def loadGeneAliases():
'''load gene aliases '''
geneSymbol = {}
for id in safeOpen('/home/zhangc/database/gene_aliases_20131231.gz'):
cols = id.split('\t')
geneSymbol[cols[1]] = cols[0]
return geneSymbol
def loadGO():
'''load gene ontology database'''
GOdatabase = {}
#key: gene
#value: F:GO term|C:GO term
for gene in safeOpen('/home/zhangc/database/ATH_GO_GOSLIM.txt.gz'):
cols = gene.split('\t')
if GOdatabase.has_key(cols[0]):
GOdatabase[cols[0]] += '|' + cols[7] + ':' + cols[4]
else:
GOdatabase[cols[0]] = cols[7] + ':' + cols[4]
# reform database
for key in GOdatabase:
value = []
for term in GOdatabase[key].split('|'):
value.append(term)
GOdatabase[key] = '|'.join(set(value))
return GOdatabase
def GOannotate(GOdatabase, gene):
''' GO annotation'''
try:
transGene = geneSymbol[gene]
except:
transGene = gene
try:
GOannotation = GOdatabase[transGene]
except:
GOannotation = 'NULL'
print 'WARNING: "%s" has no GO annotation term' % gene
return GOannotation
if __name__ == '__main__':
##Init
# the Number of gene col ## 0 base
GENEIDCOL = 2
#check input file
if len(sys.argv) != 3:
usage()
infile = sys.argv[1]
if not os.path.exists(infile):
usage()
# load databse for gene ID transformate
geneSymbol = loadGeneAliases()
# load GO database
GOdatabase = loadGO()
# output file
out = open(sys.argv[2], 'w')
for line in safeOpen(infile):
cols = line.strip().split('\t')
gene = cols[GENEIDCOL]
if gene == '-':
out.write('\t'.join(cols) + '\n')
continue
if gene.find(',') == -1:
# not find ','
GOannotation = GOannotate(GOdatabase, gene)
cols.append(GOannotation)
else:
for g in gene.split(','):
GOannotation = GOannotate(GOdatabase, g)
cols.append(GOannotation)
out.write('\t'.join(cols) + '\n')
out.close()