-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcreate_go_annots_sprot_trembl.py
executable file
·110 lines (88 loc) · 4.01 KB
/
create_go_annots_sprot_trembl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
"""
> create_go_annots_sprot_trembl.py <
Based on the blastp results of the transcripts vs. sprot/trembl,
create a GO annotation file for the transcripts.
Assumes input files follow the standard pattern of "xxxx_vs_sprot.tGO.tsv" and
"xxxx_vs_trembl.tGO.tsv".
Hit threshold = 1e-5
Workflow goes:
1. If there's a hit from Swiss-Prot below the threshold, take it.
2. If not, check TrEMBL.
3. nr doesn't have GO annotations... too bad.
Program uses the tabular form of the blastp results (parse_blast_xml.py's
--table), not the raw XML format.
"""
import argparse
import re
import natural_sort
import parse_go_obo
parser = argparse.ArgumentParser(description="""
Based on the blastp results of the transcripts vs. sprot/trembl,
create a GO annotation file for the transcripts.""")
parser.add_argument('species', metavar="species_code",
help="3/4-letter code for species in question.")
parser.add_argument('-n', '--no_parents', action='store_true',
help='suppresses the search for parent GO terms.')
args = parser.parse_args()
E_VALUE_THRESHOLD = 1e-5
sprot_tsv = open('{}_vs_sprot.tGO.tsv'.format(args.species))
trembl_tsv = open('{}_vs_trembl.tGO.tsv'.format(args.species))
# NOTE: UniProtKB IDs can now be in 6-char or 10-char format. script _can_ deal
# with both. create dictionary to store UniProtKB ID of the homologue that is
# the most similar to the transcripts
# transcript_uniprot_id[transcript_name] = 'XXXXXX'
transcript_uniprot_id = {}
for line in sprot_tsv:
# exclude header line
if 'sp|' in line:
cols = line.split('\t')
transcript_uniprot_id[cols[0]] = re.search('sp\|(\w+)\|', cols[1]).group(1)
for line in trembl_tsv:
# exclude header line
if 'tr|' in line:
cols = line.split('\t')
if cols[0] not in transcript_uniprot_id:
transcript_uniprot_id[cols[0]] = re.search('tr\|(\w+)\|', cols[1]).group(1)
all_uniprot_id = set(transcript_uniprot_id.values())
goa_tsv = open('/lithium/data_repo/goa/goa_uniprot_all.parsed.gaf')
# create dictionary to translate the six-character UniProt ID into GO terms
goa_data = {}
for line in goa_tsv:
if 'GO' in line:
cols = line.strip().split('\t')
if cols[0] in all_uniprot_id:
if cols[0] not in goa_data:
goa_data[cols[0]] = []
goa_data[cols[0]].append(cols[1])
# search for parent GO terms for each child term
associated_go_terms = {}
for t in transcript_uniprot_id:
expanded_go_terms = []
# not all six-char UniProt IDs have GO terms assigned to them
if transcript_uniprot_id[t] in goa_data:
if args.no_parents:
# suppress search for parent GO terms
expanded_go_terms = goa_data[transcript_uniprot_id[t]]
else:
for u in goa_data[transcript_uniprot_id[t]]:
expanded_go_terms += parse_go_obo.get_all_parent_terms(u)
associated_go_terms[t] = sorted(list(set(expanded_go_terms)))
# to create three separate files, only with BP, CC OR MF terms.
binned_go_terms = {}
for t in associated_go_terms:
binned_go_terms[t] = parse_go_obo.bin_go_terms_by_namespace(associated_go_terms[t])
# print it out in a list
with open('{}_go_annots.{}all.tsv'.format(args.species, 'no_parents.' if args.no_parents else ''), 'w') as f:
for t in natural_sort.natural_sort(associated_go_terms):
if associated_go_terms[t]:
# only print transcripts with non-empty GO term lists
print (t, ','.join(associated_go_terms[t]), sep='\t', file=f)
bins = ['biological_process', 'cellular_component', 'molecular_function']
for b in bins:
abbrev_fn = ''.join([x[0] for x in b.split('_')])
with open('{}_go_annots.{}{}.tsv'.format(args.species, 'no_parents.' if args.no_parents else '', abbrev_fn), 'w') as f:
for t in natural_sort.natural_sort(binned_go_terms):
if binned_go_terms[t][b]:
# only print transcripts with non-empty GO term lists
print (t, ','.join(binned_go_terms[t][b]), sep='\t', file=f)