-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathblast.py
66 lines (55 loc) · 1.53 KB
/
blast.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import gzip
import sys
def read_bl2seq(filename): # only tested for BLASTN
if filename.endswith('.gz'):
fp = gzip.open(filename, 'rt')
else:
fp = open(filename)
score = None
percent = None
qalign = []
salign = []
for line in fp:
f = line.split()
if line.startswith(' Score'):
if score: break # get the first alignment
score = float(f[2])
if line.startswith(' Identities'):
n, l = f[2].split('/')
percent = int(n) / int(l)
if line.startswith('Query:'):
qalign.append(f[2])
if line.startswith('Sbjct:'):
salign.append(f[2])
fp.close()
return score, percent, ''.join(qalign).upper(), ''.join(salign).upper()
def find_gaps(s1, s2):
gapstrings = []
off = 0
while True:
beg = s1.find('-', off)
if beg == -1: break
for end in range(beg +1, len(s1)):
if s1[end] != '-': break
off = end + 1
gapstrings.append(s2[beg:end])
return gapstrings
score, pct, qa, sa = read_bl2seq('gapped.blastn')
print(score, pct)
# example of how subs could be counted
change = {}
for q, s in zip(qa, sa):
if q == s: continue
if q == 'N' or s == 'N': continue
if q == '-' or s == '-': continue
if q not in change: change[q] = {}
if s not in change[q]: change[q][s] = 0
change[q][s] += 1
import json
print(json.dumps(change, indent=4))
# how to get gaps
print(find_gaps(qa, sa))
print(find_gaps(sa, qa))
# note that this is a simple method that doesn't take into account the
# phylogenetic relationships of the sequences: shared mutations will end
# double counted or worse. we shouldn't model all changes as independent