-
Notifications
You must be signed in to change notification settings - Fork 4
/
create_benchmark.py
116 lines (110 loc) · 4.18 KB
/
create_benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/python
'''
Program: creating CAFA benchmark
Author : Huy Nguyen
Start : 05/31/2017
End : 05/31/2017
'''
import os
import sys
import argparse
from Bio.UniProt import GOA
'''
argument parsers
'''
def get_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--t1","-t1",help="t1 file")
parser.add_argument("--t2","-t2", help="t2 file")
parser.add_argument("--output","-o",help="output directory to store the 6 benchmark file")
args = parser.parse_args()
return args
'''
function : given a file handle, parse in using gaf format and return a dictionary
that identify those protein with experimental evidence and the ontology
input : file text
output : dic (key: name of file (number), value is a big dictionary store info about the protein)
'''
def read_gaf(handle):
name = handle.split(".")[-1]
dic = {}
all_protein_name = set()
# evidence from experimental
Evidence = {'Evidence': set(['EXP','IDA','IPI','IMP','IGI','IEP'])}
with open(handle, 'r') as handle:
for rec in GOA.gafiterator(handle):
all_protein_name.add(rec['DB_Object_ID'])
if GOA.record_has(rec, Evidence) and rec['DB'] == 'UniProtKB':
if rec['DB_Object_ID'] not in dic:
dic[rec['DB_Object_ID']] = {rec['Aspect']:set([rec['GO_ID']])}
else:
if rec['Aspect'] not in dic[rec['DB_Object_ID']]:
dic[rec['DB_Object_ID']][rec['Aspect']]=set([rec['GO_ID']])
else:
dic[rec['DB_Object_ID']][rec['Aspect']].add(rec['GO_ID'])
return name,dic ,all_protein_name
'''
function : given t1 dic, t2 dic, we provide the dic for NK, and LK dic for each ontology
input : 2 dics
output : NK,LK dictionary
'''
def analyze(t1_dic,t2_dic,all_protein_t1):
NK_dic = {'P':{},'C':{},'F':{}}
LK_dic = {'P':{},'C':{},'F':{}}
# dealing with NK and LK
for protein in t2_dic:
## check the protein in t2_dic but not appear in t1
if protein not in t1_dic and protein in all_protein_t1: ## this going to be in NK
### check which ontology got new annotated
for ontology in t2_dic[protein]:
NK_dic[ontology][protein] = t2_dic[protein][ontology]
## check the protein that in t2_dic and appear in t1
elif protein in t1_dic :
## check if in t1, this protein does not have all 3 ontology
### if yes, then not include since full knowledge
### else
if len(t1_dic[protein]) < 3:
#### check if t2_dic include in the ontology that t1 lack of
for ontology in t2_dic[protein]:
if ontology not in t1_dic[protein]: # for those lack, include in LK
LK_dic[ontology][protein] = t2_dic[protein][ontology]
return NK_dic,LK_dic
'''
function : given NK,LK dic , write out 6 files
input : 2 dics
output : NK,LK dictionary
'''
def write_file(dic,knowledge,name):
for ontology in dic:
if ontology =='F':
final_name = name+knowledge+'_mfo'
elif ontology =='P':
final_name = name+knowledge+'_bpo'
elif ontology =='C':
final_name = name+knowledge+'_cco'
print "Writing {} file".format(final_name)
file_out = open(final_name,'w')
for protein in sorted(dic[ontology]):
for annotation in dic[ontology][protein]:
file_out.write(protein +'\t'+annotation+'\n')
file_out.close()
return None
if __name__ == "__main__":
args = get_arguments()
t1 = args.t1
t2 = args.t2
outdir = args.output
t1_name,t1_dic,all_protein_t1 = read_gaf(t1)
# print "t1 \n",t1_dic
t2_name,t2_dic,all_protein_t2 = read_gaf(t2)
# print "t2 \n",t2_dic
NK_dic,LK_dic= analyze(t1_dic,t2_dic,all_protein_t1)
try:
os.mkdir(outdir)
except:
print "The directory has already been created"
num1 = t1.split('.')[-1]
num2 = t2.split('.')[-1]
name = outdir+'/'+'.'.join(t1.split('/')[-1].split('.')[:-1])+'.'+num2+'-'+num1+'_benchmark_'
write_file(NK_dic,'NK',name)
write_file(LK_dic,'LK',name)