-
Notifications
You must be signed in to change notification settings - Fork 3
/
search.go
122 lines (108 loc) · 2.06 KB
/
search.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// https://artem.krylysov.com/blog/2020/07/28/lets-build-a-full-text-search-engine/
package main
import (
"strings"
"unicode"
snowballeng "github.com/kljensen/snowball/english"
)
type index map[string][]int
func (idx index) add(titles []string) {
for id, title := range titles {
for _, token := range analyze(title) {
if contains(idx[token], id) {
continue
}
idx[token] = append(idx[token], id)
}
}
}
func analyze(text string) []string {
tokens := tokenize(text)
tokens = toLower(tokens)
tokens = removeCommonWords(tokens)
tokens = stem(tokens)
return tokens
}
func tokenize(text string) []string {
return strings.FieldsFunc(text, func(r rune) bool {
return !unicode.IsLetter(r) && !unicode.IsNumber(r)
})
}
func toLower(tokens []string) []string {
r := make([]string, len(tokens))
for i, token := range tokens {
r[i] = strings.ToLower(token)
}
return r
}
var stopWords = map[string]struct{}{
"a": {},
"and": {},
"be": {},
"have": {},
"i": {},
"in": {},
"of": {},
"that": {},
"the": {},
"to": {},
}
func removeCommonWords(tokens []string) []string {
r := make([]string, 0, len(tokens))
for _, token := range tokens {
if _, ok := stopWords[token]; !ok {
r = append(r, token)
}
}
return r
}
func stem(tokens []string) []string {
r := make([]string, len(tokens))
for i, token := range tokens {
r[i] = snowballeng.Stem(token, false)
}
return r
}
func contains(slice []int, val int) bool {
for _, s := range slice {
if s == val {
return true
}
}
return false
}
func intersection(a, b []int) []int {
maxLen := len(a)
if len(b) > maxLen {
maxLen = len(b)
}
r := make([]int, 0, maxLen)
var i, j int
for i < len(a) && j < len(b) {
if a[i] < b[j] {
i++
} else if a[i] > b[j] {
j++
} else {
r = append(r, a[i])
i++
j++
}
}
return r
}
func (idx index) search(text string) []int {
var r []int
for _, token := range analyze(text) {
if ids, ok := idx[token]; ok {
if r == nil {
r = ids
} else {
r = intersection(r, ids)
}
} else {
return nil
}
}
return r
}