-
Notifications
You must be signed in to change notification settings - Fork 18
/
kmer.py
66 lines (51 loc) · 2.27 KB
/
kmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
class kmer_featurization:
    """Turn DNA sequences into fixed-length k-mer count/frequency vectors.

    Every possible k-mer over the alphabet ``self.letters`` is assigned an
    index by reading it as a base-4 number (most significant letter first),
    so a feature vector always has ``4 ** k`` entries.
    """

    def __init__(self, k):
        """
        Args:
          k:
            the "k" in k-mer, i.e. the length of the sub-sequences to count.
        """
        self.k = k
        # The position of a letter in this list is its base-4 digit value.
        self.letters = ['A', 'T', 'C', 'G']
        # Place value of each digit position in the base-4 number system,
        # most significant first: [4**(k-1), ..., 4, 1].
        self.multiplyBy = 4 ** np.arange(k - 1, -1, -1)
        # Number of possible k-mers == length of one feature vector.
        self.n = 4 ** k

    def obtain_kmer_feature_for_a_list_of_sequences(self, seqs, write_number_of_occurrences=False):
        """
        Given m DNA sequences, return a 2-d array with shape (m, 4**k) holding
        the k-mer feature vector of each sequence, one row per sequence.

        Sequences are upper-cased before featurization, so lower-case input is
        accepted here.

        Args:
          seqs:
            an iterable of strings, the DNA sequences.
          write_number_of_occurrences:
            a boolean. If False, the fraction of the sequence's k-mers that
            each k-mer accounts for is recorded; otherwise the raw number of
            occurrences is recorded. Default False.

        Returns:
          a float array of shape (m, 4**k); shape (0, 4**k) when `seqs` is
          empty.

        Raises:
          ValueError: if a sequence contains a letter not in `self.letters`.
        """
        kmer_features = [
            self.obtain_kmer_feature_for_one_sequence(
                seq.upper(),
                write_number_of_occurrences=write_number_of_occurrences,
            )
            for seq in seqs
        ]
        if not kmer_features:
            # np.array([]) would be 1-d with shape (0,); keep the documented
            # 2-d layout so downstream code can rely on the column count.
            return np.zeros((0, self.n))
        return np.array(kmer_features)

    def obtain_kmer_feature_for_one_sequence(self, seq, write_number_of_occurrences=False):
        """
        Given a DNA sequence, return its k-mer feature vector of length 4**k.

        The sequence is used as given (no case conversion happens here), so it
        must already consist of the letters in `self.letters`.

        Args:
          seq:
            a string, a DNA sequence.
          write_number_of_occurrences:
            a boolean. If False, the fraction of the sequence's k-mers that
            each k-mer accounts for is recorded; otherwise the raw number of
            occurrences is recorded. Default False.

        Returns:
          a 1-d float array of length 4**k. It is all zeros when the sequence
          is shorter than k and therefore contains no k-mer.

        Raises:
          ValueError: if the sequence contains a letter not in `self.letters`.
        """
        number_of_kmers = len(seq) - self.k + 1
        kmer_feature = np.zeros(self.n)

        if number_of_kmers <= 0:
            # No k-mer fits in the sequence. Returning early avoids dividing
            # by zero (NaN output) or by a negative count below.
            return kmer_feature

        for i in range(number_of_kmers):
            this_kmer = seq[i:(i + self.k)]
            this_numbering = self.kmer_numbering_for_one_kmer(this_kmer)
            kmer_feature[this_numbering] += 1

        if not write_number_of_occurrences:
            kmer_feature = kmer_feature / number_of_kmers

        return kmer_feature

    def kmer_numbering_for_one_kmer(self, kmer):
        """
        Given a k-mer, return its numbering (its 0-based position in the
        feature vector), obtained by reading the k-mer as a base-4 number.

        Args:
          kmer:
            a string of exactly k letters drawn from `self.letters`.

        Returns:
          the integer index of the k-mer, in the range [0, 4**k).

        Raises:
          ValueError: if `kmer` does not have length k or contains a letter
            not in `self.letters`.
        """
        if len(kmer) != self.k:
            # Without this check a wrong-length k-mer either fails with an
            # obscure broadcasting error or, for a single letter, is silently
            # broadcast against all place values and yields a wrong index.
            raise ValueError(
                "expected a k-mer of length %d, got %r" % (self.k, kmer)
            )

        digits = []
        for letter in kmer:
            try:
                digits.append(self.letters.index(letter))
            except ValueError:
                raise ValueError(
                    "invalid letter %r in k-mer %r; expected one of %s"
                    % (letter, kmer, self.letters)
                ) from None

        digits = np.array(digits)
        numbering = (digits * self.multiplyBy).sum()
        return numbering