-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathparse_uniprot_dat.py
executable file
·54 lines (45 loc) · 1.66 KB
/
parse_uniprot_dat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python3
"""
> parse_uniprot_dat.py <
Python script to parse UniProt's *.dat files: records are yielded one at a
time (a record ends at a line starting with '//'), and selected fields can then
be extracted from each record.
iter_sequences() requires a _file object_ (not a filename).
"""
import re
def iter_sequences(dat_file):
    """
    Yield the records of a UniProt *.dat file one at a time.

    Each record is expected to follow the pattern:
        ID xxxx     (starting line)
        ...         (any number of other lines)
        //          (ending line)

    Every yielded string holds the lines of one record exactly as they were
    read, up to but excluding the '//' terminator line. Lines that come after
    the last terminator are never yielded.
    """
    record_lines = []
    for line in dat_file:
        if not line.startswith('//'):
            record_lines.append(line)
            continue

        # terminator reached: emit what was collected and start afresh
        yield ''.join(record_lines)
        record_lines = []
def _join_without_final_period(parts):
    """Join *parts* with single spaces and drop one trailing '.', if present."""
    joined = ' '.join(parts)
    return joined[:-1] if joined.endswith('.') else joined


def parse_uniprot_seq(seq):
    """
    Extract selected fields from the text of a single UniProt record.

    seq is the record as one string (the lines of the record, without the
    '//' terminator), e.g. one item yielded by iter_sequences().

    Returns a dict of strings with the keys:
        'id'   first word containing '_' on the ID line
        'ac'   first accession on the first AC line
        'src'  database name following 'UniProtKB/' on a DT line
        'sv'   number following 'sequence version'
        'de'   text after the first 'RecName: Full=' / 'SubName: Full=',
               up to the next ';'
        'gn'   word characters of the first 'key=value' on a GN line,
               or '' when the record has no such line
        'os'   all OS lines joined by spaces, without the final '.'
        'oc'   all OC lines joined by spaces, without the final '.'
        'pe'   leading number on the PE line
        'kw'   all KW lines joined by spaces (final '.' is kept)
        'sq'   sequence residues with spaces and newlines removed

    Raises AttributeError when any field other than 'gn', 'os', 'oc' or 'kw'
    cannot be found, because the corresponding re.search() returns None.
    """
    entry_name = re.search(r'^ID\s+(\w+_\w+)', seq, re.M).group(1)
    accession = re.search(r'^AC\s+(\w+);', seq, re.M).group(1)
    source_db = re.search(r'^DT.*?UniProtKB/([\w-]+)', seq, re.M).group(1)
    seq_version = re.search(r'sequence version (\d+)', seq).group(1)
    # a real alternation: the old '[Rec|Sub]' was a one-character class that
    # matched 'RecName' / 'SubName' only through their last letter
    description = re.search(r'(?:Rec|Sub)Name: Full=(.*?);', seq).group(1)

    # the gene name is optional; a record without a GN line gets ''
    gene_match = re.search(r'^GN\s+\w+=(\w+)', seq, re.M)
    gene_name = gene_match.group(1) if gene_match else ''

    organism_lines = re.findall(r'^OS\s+(.*?)\n', seq, re.M)
    lineage_lines = re.findall(r'^OC\s+(.*?)\n', seq, re.M)
    evidence = re.search(r'^PE\s+(\d+)', seq, re.M).group(1)
    keyword_lines = re.findall(r'^KW\s+(.*?)\n', seq, re.M)
    # with DOTALL the greedy '.*;' runs to the last ';' after 'SQ', i.e. the
    # end of the SQ header line, so the group holds only the residue lines
    residues = re.search(r'^SQ.*;(.*)', seq, re.DOTALL | re.M).group(1)

    # postprocessing
    organism = _join_without_final_period(organism_lines)
    lineage = _join_without_final_period(lineage_lines)
    keywords = ' '.join(keyword_lines)
    residues = residues.replace(' ', '').replace('\n', '')

    return {
        'id': entry_name,
        'ac': accession,
        'src': source_db,
        'sv': seq_version,
        'de': description,
        'gn': gene_name,
        'os': organism,
        'oc': lineage,
        'pe': evidence,
        'kw': keywords,
        'sq': residues,
    }