-
Notifications
You must be signed in to change notification settings - Fork 0
/
wk4b.py
66 lines (54 loc) · 2.09 KB
/
wk4b.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import itertools
def fastq_to_reads(file):
with open(file, 'r') as f:
reads = f.readlines()
reads = [r.strip() for r in reads[1::4]]
return reads
cache = {}
def overlap(a, b, min_length=3):
''' return length of longest suffix of 'a' matching
a prefix of 'b' that is at least length 'min_length'
characters long. Return 0 if no such overlap exists. '''
if (a, b, min_length) in cache:
return cache[(a, b, min_length)]
start = 0 # start all the way to the left
while True:
start = a.find(b[:min_length], start) # look for b's pref in a
if start == -1: # no more occurrences to the right
cache[(a, b, min_length)] = 0
return 0
# found occurrence, check for full suff/pref match
if b.startswith(a[start:]):
ret = len(a) - start
cache[(a, b, min_length)] = ret
return ret
start += 1 # move past previous match
def pick_maximal_overlap(reads, k):
''' Return a pair of reads from the list with a
maximal suffix/prefix overlap >= k. Returns
overlap length 0 if there are no such overlaps.'''
reada, readb = None, None
best_olen = 0
for a, b in itertools.permutations(reads, 2):
olen = overlap(a, b, min_length=k)
if olen > best_olen:
reada, readb = a, b
best_olen = olen
return reada, readb, best_olen
def greedy_scs(reads, k):
''' Greedy shortest-common-superstring merge.
Repeat until no edges (overlaps of length >= k)
remain. '''
read_a, read_b, olen = pick_maximal_overlap(reads, k)
while olen > 0:
reads.remove(read_a)
reads.remove(read_b)
reads.append(read_a + read_b[olen:])
read_a, read_b, olen = pick_maximal_overlap(reads, k)
return ''.join(reads)
reads = fastq_to_reads('ads1_week4_reads.fq')
print("reads len:", len(reads))
print("unique reads:", len(set(reads)))
# scs = greedy_scs(reads, 30)
# print("scs len:", len(scs), "num 'A':", scs.count('A'), "num 'T':", scs.count('T'))
# len:15894, A:4633, T:3723