-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNo187_RepeatedDNASequence.cpp
161 lines (127 loc) · 5.07 KB
/
No187_RepeatedDNASequence.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
// 187. Repeated DNA Sequences
// All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
// Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
// For example,
// Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",
// Return:
// ["AAAAACCCCC", "CCCCCAAAAA"].
// Directly usage of Rabin Karp, it needs some improvment to store the hash value
class Solution {
public:
int rabinKarp(string text, string pattern){
assert(text.size() >= pattern.size());
int res = 0;
const int prime = 101;
const int base = 256;
int t_hash = 0;
int p_hash = 0;
int update_hash = 1;
// update_hash means how many times base you times during calculate the hash value
for(int i=0; i<pattern.size() - 1; i++){
update_hash = update_hash * base % prime;
}
for(int i=0; i<pattern.size(); i++){
p_hash = ((p_hash * base % prime ) + pattern[i] ) % prime;
t_hash = ((t_hash * base % prime) + text[i]) % prime;
}
// Here need to <=, in order to do the last comparsion
for(int j=0; j<=text.size() - pattern.size(); j++){
// If equal hash value, it has probability to be the same
if(t_hash == p_hash){
int k = 0;
for(k=0; k<pattern.size(); k++){
if(pattern[k] != text[j+k]) break;
}
if(k == pattern.size()){
res++;
}
}
// If the hash value is not equal, update the hash value by
// rolling hash
// Add prime to avoid the negative hash value
if(j < text.size() - pattern.size()){
t_hash = ((t_hash - update_hash * text[j])
* base % prime + text[j+pattern.size()]) % prime;
if(t_hash < 0) t_hash += prime;
}
}
return res;
}
vector<string> findRepeatedDnaSequences(string s) {
set<string> res;
if(s.size() < 10) return vector<string>(res.begin(), res.end());
string pattern;
for(int i=0; i<s.size()-10; i++){
pattern = s.substr(i, 10);
if(res.count(pattern) == 0 && rabinKarp(s, pattern) > 1){
res.insert(pattern);
}
}
return vector<string>(res.begin(), res.end());
}
};
bool sameString(string& s, unordered_multimap<int,string> multi_hash, int key){
auto range = multi_hash.equal_range(key);
for(auto iter = range.first; iter != range.second; iter++){
if (s == iter->second){
return true;
}
}
return false;
}
class Solution {
public:
vector<string> findRepeatedDnaSequences(string s) {
const int base = 256;
const int prime = 101;
const int pattern_length = 10;
int update_hash = 1;
int p_hash = 0;
// set<pair<int, string>> unique_hash;
unordered_multimap<int, string> unique_hash;
unordered_set<string> repeated_string;
// Update_hash calculation
for(int i=0; i<pattern_length-1; i++){
update_hash = update_hash * base % prime;
}
for(int i=0; i<s.size(); i++){
if(i < pattern_length)
p_hash = (p_hash * base + s[i]) % prime;
// update p_hash
if (i > pattern_length - 1){
p_hash = ((p_hash - update_hash * s[i-pattern_length]) * base + s[i]) % prime;
if(p_hash < 0) p_hash += prime;
//cout << "every hash value " << p_hash << endl;
}
if (i > pattern_length - 2){
string sub_string = s.substr(i-pattern_length+1, pattern_length);
if(!unique_hash.count(p_hash) || (unique_hash.count(p_hash) && !sameString(sub_string, unique_hash, p_hash))) {
unique_hash.insert(make_pair(p_hash, s.substr(i-pattern_length+1, pattern_length)));
//cout << "set hash " << p_hash << endl;
}
else {
repeated_string.insert(s.substr(i-pattern_length+1, pattern_length));
//cout << "p_hash " << p_hash << " string " << s.substr(i-pattern_length+1, pattern_length) << endl;
}
}
}
vector<string> res(repeated_string.begin(), repeated_string.end());
return res;
}
};
// No use of hash, directly store the string, pass
class Solution {
public:
vector<string> findRepeatedDnaSequences(string s) {
unordered_set<string> res, unique;
const int pattern_length = 10;
for(int i=0; i + pattern_length <= s.size(); i++){
string tmp = s.substr(i, pattern_length);
if(unique.count(tmp)) res.insert(tmp);
else{
unique.insert(tmp);
}
}
return vector<string>(res.begin(), res.end());
}
};