-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathflashtext.go
367 lines (311 loc) · 8.79 KB
/
flashtext.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
package flashtext
import (
"bufio"
"fmt"
"log"
"os"
"strings"
)
const separator string = "=>"
type TrieNode struct {
selfRune rune
children map[rune]*TrieNode
isWord bool
cleanWord string
keep bool
key string
}
func newTrieNode() *TrieNode {
return &TrieNode{
children: make(map[rune]*TrieNode),
}
}
type FlashKeywords struct {
root *TrieNode
size int // nbr of keys
nbrNodes int
caseSensitive bool
}
// Instantiate a new Instance of the `FlashKeywords` with
// a case sensitive true or false
func NewFlashKeywords(caseSensitive bool) *FlashKeywords {
return &FlashKeywords{
root: newTrieNode(),
nbrNodes: 1,
caseSensitive: caseSensitive,
}
}
// Returns the number of the keys inside the keys dictionary
func (tree *FlashKeywords) Size() int {
return tree.size
}
// Returns a map of all the keys in the trie with their `cleanWord`
func (tree *FlashKeywords) GetAllKeywords() map[string]string {
key2Clean := make(map[string]string, tree.size)
stack := make([]*TrieNode, 0, tree.nbrNodes)
stack = append(stack, tree.root)
for len(stack) > 0 {
node := stack[len(stack)-1]
stack = stack[:len(stack)-1]
if node.isWord {
key2Clean[node.key] = node.cleanWord
}
for _, child := range node.children {
stack = append(stack, child)
}
}
return key2Clean
}
func (tree *FlashKeywords) addKeyWord(word string, cleanWord string) {
if !tree.caseSensitive {
word = strings.ToLower(word)
cleanWord = strings.ToLower(cleanWord)
}
currentNode := tree.root
for _, char := range word {
if currentNode.isWord {
currentNode.keep = true
}
if _, ok := currentNode.children[char]; !ok {
currentNode.children[char] = newTrieNode()
tree.nbrNodes++
}
currentNode = currentNode.children[char]
currentNode.selfRune = char
}
if !currentNode.isWord {
tree.size++
currentNode.isWord = true
if len(currentNode.children) != 0 {
currentNode.keep = true
}
currentNode.key = word
currentNode.cleanWord = cleanWord
} else if cleanWord != "" {
if currentNode.cleanWord != "" {
log.Printf("Warning: overwrite the clean word of %s from %s to %s",
word, currentNode.cleanWord, cleanWord)
}
currentNode.cleanWord = cleanWord
}
}
// Add the key `word` into the trie
func (tree *FlashKeywords) Add(word string) {
tree.addKeyWord(word, "")
}
// Add the key `word` into the trie with the corresponding `cleanWord`
func (tree *FlashKeywords) AddKeyWord(word string, cleanWord string) {
tree.addKeyWord(word, cleanWord)
}
// Add Multiple Keywords simultaneously from a map example:
//
// keyword_dict = {
// "java": ["java_2e", "java programing"],
// "product management": ["PM", "product manager"]
// }
// trie.AddFromMap(keyword_dict)
func (tree *FlashKeywords) AddFromMap(keys2synonyms map[string][]string) {
for key, listSynonyms := range keys2synonyms {
for _, synonym := range listSynonyms {
tree.addKeyWord(synonym, key)
}
}
}
// Add Multiple Keywords simultaneously from a file by providing the `filePath`
func (tree *FlashKeywords) AddFromFile(filePath string) error {
file, err := os.Open(filePath)
if err != nil {
log.Fatal(err)
return err
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
line := scanner.Text()
synonym2key := strings.Split(line, separator)
if len(synonym2key) == 2 {
tree.addKeyWord(synonym2key[0], synonym2key[1])
} else if len(synonym2key) == 1 {
tree.addKeyWord(synonym2key[0], "")
} else {
log.Printf("Skipped malformed line %s. The Correct format is: key1=>key2", line)
}
}
return nil
}
// Returns the corresponding `cleanWord` for the key `word` from the trie
func (tree *FlashKeywords) GetKeysWord(word string) (string, error) {
currentNode := tree.root
for _, char := range word {
currentNode = currentNode.children[char]
if currentNode == nil {
return "", fmt.Errorf("the word %s doesn't exists in the keywords dictionnary", word)
}
}
if !currentNode.isWord {
return "", fmt.Errorf("the word %s doesn't exists in the keywords dictionnary", word)
}
return currentNode.cleanWord, nil
}
// Check if the key `word` exists in the trie dictionary
func (tree *FlashKeywords) Contains(word string) bool {
currentNode := tree.root
for _, char := range word {
currentNode = currentNode.children[char]
if currentNode == nil {
return false
}
}
return currentNode.isWord
}
// Remove the key `word` from the trie dictionary
func (tree *FlashKeywords) RemoveKey(word string) bool {
var nextNode *TrieNode
parent := make(map[*TrieNode]*TrieNode)
currentNode := tree.root
for _, currChar := range word {
if _, ok := currentNode.children[currChar]; !ok {
return false
}
nextNode = currentNode.children[currChar]
parent[nextNode] = currentNode
currentNode = nextNode
}
if !currentNode.isWord {
return false
}
currentNode.isWord = false
tree.size--
var (
parentNode *TrieNode
childRune rune
)
for currentNode != tree.root && len(currentNode.children) == 0 && !currentNode.isWord {
parentNode = parent[currentNode]
childRune = currentNode.selfRune
currentNode = nil
tree.nbrNodes--
delete(parentNode.children, childRune)
currentNode = parentNode
}
return true
}
// the resulting output struct:
// - `Key`: the string keyword found in the search text
// - `IsPrefix` (false/true): indicates if the key A is a prefix of another string(key B)
// where A and B are both in the dictionary of the flash keywords
// - `CleanWord`: the string with which the found key will be replaced in the text.
// We can think of it also like the origin word of the synonym found in the text.
// - `Start & End`: span information about the start and end indexes if the key found in the text
type Result struct {
Key string
IsPrefix bool // support for key the smallest(the prefix) and the longest match
CleanWord string
Start int
End int
}
// Search in the text for the stored keys in the trie and
// returns a slice of `Result`
func (tree *FlashKeywords) Search(text string) []Result {
if !tree.caseSensitive {
text = strings.ToLower(text)
}
n := len(text)
var res []Result
currentNode := tree.root
start := 0
for idx, char := range text {
currentNode = currentNode.children[char]
if currentNode == nil {
currentNode = tree.root
start = idx + 1
} else {
if currentNode.isWord {
isPrefix := false
if currentNode.keep {
// possibility to be a prefix of another continous word
if idx+1 < n {
if _, ok := currentNode.children[rune(text[idx+1])]; ok {
isPrefix = true
}
}
}
res = append(res, Result{
Key: currentNode.key,
IsPrefix: isPrefix,
CleanWord: currentNode.cleanWord,
Start: start,
End: idx,
})
if !isPrefix {
// go back to root with 2 conditions (see TestGoBackToRootTrick):
// - simple one if keep=false (isPrefix=false by default)
// - keep can be true but when we look one step ahead
// no node is founded => Go back to root
currentNode = tree.root
start = idx + 1
}
}
}
}
return res
}
// Replace the keys found in the text with their `cleanWord` if it exists
// and returns a new string with the replaced keys
func (tree *FlashKeywords) Replace(text string) string {
if !tree.caseSensitive {
text = strings.ToLower(text)
}
buf := make([]rune, 0, len(text))
bufSize := 0
currentNode := tree.root
// track the tail of the buf to know if append or set with a new rune buf[lastChange]
// in the case lenght of key is different from lenght of cleanWord
lastChange := 0
for idx, char := range text {
if lastChange < bufSize {
buf[lastChange] = char
lastChange++
} else {
buf = append(buf, char)
bufSize++
lastChange = bufSize
}
currentNode = currentNode.children[char]
if currentNode == nil {
currentNode = tree.root
} else if currentNode.isWord {
if currentNode.cleanWord != "" {
// repalce opp `leftmost match first`(replace key with the cleanWord)
runeKeySize := len([]rune(currentNode.key))
start := bufSize - runeKeySize
lastChange = start
for _, cChar := range currentNode.cleanWord {
if start < bufSize {
buf[start] = cChar
start++
lastChange++
} else {
buf = append(buf, cChar)
bufSize++
lastChange = bufSize
start = bufSize
}
}
// done with replacement Go back to root
currentNode = tree.root
} else if currentNode.keep {
// in case the currentNode(isWord) doesn't have a cleanWord
// worth to check if it is a prefix of another `big word`
// to make the replacement opp on the `big word`
if _, ok := currentNode.children[rune(text[idx+1])]; !ok {
// nothing found go back to root
currentNode = tree.root
}
} else {
currentNode = tree.root
}
}
}
return string(buf[:lastChange])
}