-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsuffix_automaton.cpp
201 lines (190 loc) · 5.7 KB
/
suffix_automaton.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/* Suffix Automaton
*
* Intuitively a suffix automaton can be understood as a compressed form of all substrings of a given string
*
* Fields:
* - st: vector of states
* - len: length of the longest suffix in the subtree rooted at this state
* - link: suffix link
* - cnt: number of occurrences of the substring represented by the path from the root to this state
* - first_pos: position of the first occurrence of the substring represented by the path from the root to this state
* - repeated_cnt: number of substrings in the subtree rooted at this state
* - distinct_cnt: number of distinct substrings in the subtree rooted at this state
* - next: map from characters to the next states
* - sz: number of states
* - last: index of the last state
*
* Methods:
* - add(char c): add a character to the automaton
* - Time: O(1) amortized
* - find(const string &t): find the state corresponding to the substring t
* - Time: O(|t|)
* - is_substring(const string &t): check if t is a substring
* - Time: O(|t|)
* - count(const string &t): count the number of occurrences of t
* - Time: O(|t|)
* - first_pos(const string &t): find the position of the first occurrence of t
* - Time: O(|t|)
* - Returns -1 if t is not a substring
* - distinct_substrings(): count the number of distinct substrings
* - Time: O(n)
* - distinct_substrings_by_size(): count the number of distinct substrings of each length
* - Time: O(n)
* - kth_substring(ll k): find the k-th lexicographically smallest substring
* - Time: O(n)
* - kth_distinct_substring(ll k): find the k-th lexicographically smallest distinct substring
* - Time: O(n)
* - update_states_cnt(): update the cnt field of all states
* - Time: O(n*log(n))
* - Only needed to calculate the number of occurrences of a substring
* - update_states_kth_counts(): update the distinct_cnt and repeated_cnt field of all states
* - Time: O(n)
* - Only needed to calculate the kth substring (distinct or not)
*/
struct SuffixAutomaton {
struct state {
int len, link, cnt, first_pos;
ll repeated_cnt, distinct_cnt;
map<char, int> next;
};
vector<state> st;
int sz, last, n;
SuffixAutomaton(const string &s) : SuffixAutomaton(s, (int)s.size()<<1) {}
SuffixAutomaton(const string &s, int maxlen) : st(maxlen), sz(1), last(0), n(0) {
st[0].len = 0;
st[0].link = -1;
for (auto c : s) add(c);
update_states_cnt();
update_states_kth_counts();
}
void add(char c) {
++n;
int cur = sz++;
st[cur].len = st[last].len + 1;
st[cur].first_pos = st[cur].len - 1;
st[cur].cnt = 1;
int p = last;
while (p != -1 and !st[p].next.count(c)) {
st[p].next[c] = cur;
p = st[p].link;
}
if (p == -1) {
st[cur].link = 0;
} else {
int q = st[p].next[c];
if (st[p].len + 1 == st[q].len) {
st[cur].link = q;
} else {
int clone = sz++;
st[clone].len = st[p].len + 1;
st[clone].next = st[q].next;
st[clone].link = st[q].link;
st[clone].first_pos = st[q].first_pos;
while (p != -1 and st[p].next[c] == q) {
st[p].next[c] = clone;
p = st[p].link;
}
st[q].link = st[cur].link = clone;
}
}
last = cur;
}
optional<state> find(const string &t) const {
int cur = 0;
for (auto c : t) {
auto it = st[cur].next.find(c);
if (it == st[cur].next.end()) return {};
cur = it->second;
}
return st[cur];
}
inline bool is_substring(const string &t) const {
return find(t).has_value();
}
inline int count(const string &t) const {
return find(t).value_or(state()).cnt;
}
inline int first_pos(const string &t) const {
if (auto s = find(t)) return s->first_pos - (int)t.size() + 1;
else return -1;
}
ll distinct_substrings() const {
ll res = 0;
for (int i = 1; i < sz; i++)
res += st[i].len - st[st[i].link].len;
return res;
}
vector<ll> distinct_substrings_by_size() const {
vector<int> hs(sz, -1);
hs[0] = 0;
queue<int> q;
q.push(0);
int mx = 0;
while (!q.empty()) {
auto u = q.front(); q.pop();
for (auto [_, v] : st[u].next) if (hs[v] == -1) {
q.push(v);
hs[v] = hs[u] + 1;
mx = max(mx, st[v].len);
}
}
vector<ll> res(mx);
for (int i = 1; i < sz; i++) {
++res[hs[i]-1];
if (st[i].len < mx)
--res[st[i].len];
}
for (int i = 1; i < mx; i++)
res[i] += res[i-1];
return res;
}
// starts at 0
string kth_substring(ll k) {
k += n;
string res;
for (int u = 0; k >= st[u].cnt;) {
k -= st[u].cnt;
for (auto [c, v] : st[u].next) {
if (st[v].repeated_cnt > k) {
res += c; u = v;
break;
}
k -= st[v].repeated_cnt;
}
}
return res;
}
string kth_distinct_substring(ll k) {
string res;
for (int u = 0; k >= 0;) {
for (auto [c, v] : st[u].next) {
if (st[v].distinct_cnt > k) {
res += c; --k; u = v;
break;
}
k -= st[v].distinct_cnt;
}
}
return res;
}
void update_states_cnt() {
vector<pair<int, int>> order(sz-1);
for (int i = 1; i < sz; i++) order[i-1] = {st[i].len, i};
sort(rbegin(order), rend(order));
for (auto [_, i] : order) st[st[i].link].cnt += st[i].cnt;
}
void update_states_kth_counts() {
auto dfs = [&](auto &&self, int u) -> void {
auto &dis = st[u].distinct_cnt;
auto &rep = st[u].repeated_cnt;
dis = 1;
rep = st[u].cnt;
for (auto [_, v] : st[u].next) {
if (!st[v].distinct_cnt) self(self, v);
dis += st[v].distinct_cnt;
rep += st[v].repeated_cnt;
}
};
dfs(dfs, 0);
}
};