tokenizer.go
// Copyright 2018 Axel Etcheverry. All rights reserved.
// Use of this source code is governed by a MIT
// license that can be found in the LICENSE file.

package ngram

import "unicode/utf8"
// Tokenizer splits a string into n-gram tokens.
type Tokenizer interface {
	Tokenize(content string) []string
}

type tokenizer struct {
	size SizeType
}

// New returns an n-gram Tokenizer that emits tokens of the given size.
func New(size SizeType) Tokenizer {
	return &tokenizer{
		size: size,
	}
}
// Tokenize returns every n-gram of content as a substring, splitting on rune
// boundaries so that multi-byte UTF-8 characters are never cut in half.
func (t tokenizer) Tokenize(content string) []string {
	length := len(content)
	size := int(t.size)

	// runes[i] holds the byte offset at which the i-th rune starts; the final
	// entry is len(content) so the last n-gram has an end offset.
	runes := make([]int, length+1)

	ridx := 0 // rune index
	bidx := 0 // byte index

	for bidx < length {
		_, width := utf8.DecodeRuneInString(content[bidx:])
		runes[ridx] = bidx
		bidx += width
		ridx++
	}

	runes[ridx] = len(content)

	// Guard against content shorter than the n-gram size, which would
	// otherwise produce a negative slice length and panic.
	if ridx < size {
		return []string{}
	}

	// Each token is a slice of the original string between the byte offsets
	// of rune i and rune i+size.
	tokens := make([]string, ridx+1-size)

	for i := 0; i <= ridx-size; i++ {
		end := i + size
		tokens[i] = content[runes[i]:runes[end]]
	}

	return tokens
}
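
For context, a minimal usage sketch. It assumes SizeType is an integer-backed type defined elsewhere in the package (the int(t.size) conversion above suggests this), and the file name is hypothetical, not part of the repository:

// tokenizer_example_test.go (hypothetical file name)
package ngram

import "fmt"

func ExampleTokenizer() {
	// Assumes SizeType can be converted from an untyped integer constant.
	t := New(SizeType(3)) // trigram tokenizer

	// Splitting happens on rune boundaries, so the multi-byte 'é' stays intact.
	fmt.Println(t.Tokenize("héllo"))
	// Output: [hél éll llo]
}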