-
Notifications
You must be signed in to change notification settings - Fork 0
/
wk4c.py
142 lines (121 loc) · 4.64 KB
/
wk4c.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import itertools
from time import time
def fastq_to_reads(file):
with open(file, 'r') as f:
reads = f.readlines()
reads = [r.strip() for r in reads[1::4]]
return reads
cache = {}
def overlap(a, b, min_length=3):
''' return length of longest suffix of 'a' matching
a prefix of 'b' that is at least length 'min_length'
characters long. Return 0 if no such overlap exists. '''
if (a, b, min_length) in cache:
return cache[(a, b, min_length)]
start = 0 # start all the way to the left
while True:
start = a.find(b[:min_length], start) # look for b's pref in a
if start == -1: # no more occurrences to the right
cache[(a, b, min_length)] = 0
return 0
# found occurrence, check for full suff/pref match
if b.startswith(a[start:]):
ret = len(a) - start
cache[(a, b, min_length)] = ret
return ret
start += 1 # move past previous match
def overlap_map(reads, k):
# reads = ['CGTACG', 'TACGTA', 'GTACGT', 'ACGTAC', 'GTACGA', 'TACGAT']
# For every k-mer in a read, we add the read to the set object corresponding
# to that k-mer
d = {} # kmer:set(reads)
for r in reads:
i = 0
for i in range(len(r) - k + 1):
kmer = r[i:i+k]
if kmer not in d:
d[kmer] = set()
d[kmer].add(r)
olaps = {}
for a in reads:
suff = a[-k:] # looking for suffix of 'a'
for b in d[suff]: # only check other reads that contain the suff of 'a'
if a == b:
continue
olen = overlap(a, b, min_length=k)
if olen >= k:
olaps[(a, b)] = olen
return olaps
def scs_list(ss):
''' return shortest common superstring of given
strings, which must be the same length '''
shortest_sup = [None]
for ssperm in itertools.permutations(ss):
sup = ssperm[0] # superstring starts as first string
for i in range(len(ss) - 1):
# overlap adjacent strings in the permutation
olen = overlap(ssperm[i], ssperm[i+1], min_length=1)
# add non-overlapping portion of b to sup
sup += ssperm[i+1][olen:]
if shortest_sup[0] is None or len(sup) < len(shortest_sup[0]):
shortest_sup = [sup]
else:
if len(sup) == len(shortest_sup[0]):
shortest_sup.append(sup)
return shortest_sup
def pick_maximal_overlap(reads, k):
''' Return a pair of reads from the list with a
maximal suffix/prefix overlap >= k. Returns
overlap length 0 if there are no such overlaps.'''
reada, readb = None, None
best_olen = 0
for a, b in itertools.permutations(reads, 2):
olen = overlap(a, b, min_length=k)
if olen > best_olen:
reada, readb = a, b
best_olen = olen
return reada, readb, best_olen
def greedy_scs(reads, k):
''' Greedy shortest-common-superstring merge.
Repeat until no edges (overlaps of length >= k)
remain. '''
read_a, read_b, olen = pick_maximal_overlap(reads, k)
while olen > 0:
reads.remove(read_a)
reads.remove(read_b)
reads.append(read_a + read_b[olen:])
read_a, read_b, olen = pick_maximal_overlap(reads, k)
return ''.join(reads)
def greedy_scs_from_olaps(reads, olaps):
'''Create a scs from a dict of (a, b):olen'''
olaps = [(k) + (v,) for k, v in olaps.items()] # convert dict to list of tuples
olaps = sorted(olaps, key=lambda x: x[2], reverse=True) # reverse sort list by olen
read_a, read_b, olen = olaps.pop(0)
while olen > 30:
if read_a not in reads or read_b not in reads:
continue
reads.remove(read_a)
reads.remove(read_b)
s = read_a + read_b[olen:]
reads.append(s)
# update olaps
olaps = overlap_map(reads, 30)
olaps = [(k) + (v,) for k, v in olaps.items()]
olaps = sorted(olaps, key=lambda x: x[2], reverse=True)
if len(olaps) > 0:
read_a, read_b, olen = olaps.pop(0)
else:
break
return ''.join(reads)
reads = fastq_to_reads('ads1_week4_reads.fq')
print("reads len:", len(reads))
reads = list(set(reads))
print("unique reads:", len(reads))
# olaps = overlap_map(reads, 8)
olaps = overlap_map(reads, 30)
print("len olaps", len(olaps))
# scs = greedy_scs(reads, 30)
# print("scs len:", len(scs), "num 'A':", scs.count('A'), "num 'T':", scs.count('T'))
# len:15894, A:4633, T:3723
scs = greedy_scs_from_olaps(reads, olaps)
print("scs len:", len(scs), "num 'A':", scs.count('A'), "num 'T':", scs.count('T'))