token_writer.go
package datok

import (
	"bufio"
	"io"
	"strconv"
)
type Bits uint8

// TODO-Perf:
// - TokenWriter may support AvailableBuffer(), so tokens can be written
//   directly without a separate buffer. Copying from the same underlying
//   byte array is a nop then (Go 1.18).
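//
//   A minimal sketch of that idea (hypothetical, not wired in here), assuming
//   the token fits into the writer's free space; append may still allocate
//   and copy when it does not:
//
//	b := writer.AvailableBuffer()          // empty slice over the free buffer space
//	b = append(b, string(buf[offset:])...) // encode the runes straight into it
//	b = append(b, '\n')
//	writer.Write(b)                        // nop copy while b still aliases the buffer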
const (
	TOKENS Bits = 1 << iota
	SENTENCES
	TOKEN_POS
	SENTENCE_POS
	NEWLINE_AFTER_EOT

	SIMPLE = TOKENS | SENTENCES
)
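// For illustration (a hypothetical combination, not defined above): a writer
// that should emit token surface forms plus their character offsets would be
// configured by or-ing the flags, e.g.
//
//	tw := NewTokenWriter(os.Stdout, TOKENS|TOKEN_POS)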
// TokenWriter bundles the callbacks invoked during tokenization:
// one per token, one per sentence boundary, one per text end, plus a
// final flush of the underlying writer.
type TokenWriter struct {
	SentenceEnd func(int)
	TextEnd     func(int)
	Flush       func() error
	Token       func(int, []rune)
	// Fail func(int)
}
// NewTokenWriter creates a new token writer based on the options
func NewTokenWriter(w io.Writer, flags Bits) *TokenWriter {
	writer := bufio.NewWriter(w)

	posC := 0
	pos := make([]int, 0, 1024)
	sentB := true
	sent := make([]int, 0, 1024)
	init := true

	tw := &TokenWriter{}

	// tw.Fail = func(_ int) {}

	// Collect token positions and maybe tokens
	if flags&(TOKEN_POS|SENTENCE_POS) != 0 {

		// TODO:
		//   Split to
		//   - Token_pos+Tokens+Newline
		//   - Token_pos+Newline
		//   - Token_pos|Sentence_pos
		//   - Sentence_pos
		//   - Tokens

		tw.Token = func(offset int, buf []rune) {

			// TODO:
			//   Store in []uint16
			//   and write to string

			// Accept newline after EOT
			if posC == 0 && flags&NEWLINE_AFTER_EOT != 0 && buf[0] == '\n' && !init {
				posC--
			}

			init = false

			posC += offset
			pos = append(pos, posC)

			// Token is the start of a sentence
			if sentB {
				sentB = false
				sent = append(sent, posC)
			}
			posC += len(buf) - offset
			pos = append(pos, posC)

			// Collect tokens also
			if flags&TOKENS != 0 {
				writer.WriteString(string(buf[offset:]))
				writer.WriteByte('\n')
			}
		}

		// Collect tokens
	} else if flags&TOKENS != 0 {
		tw.Token = func(offset int, buf []rune) {
			writer.WriteString(string(buf[offset:]))
			writer.WriteByte('\n')
		}

		// Ignore tokens
	} else {
		tw.Token = func(_ int, _ []rune) {}
	}

	// Collect sentence positions and maybe sentence boundaries
	if flags&SENTENCE_POS != 0 {
		tw.SentenceEnd = func(_ int) {

			// Add the end position of the last token to the sentence boundary
			// TODO: This only works if token positions are taken into account
			sent = append(sent, pos[len(pos)-1])
			sentB = true

			// Collect sentences also
			if flags&SENTENCES != 0 {
				writer.WriteByte('\n')
			}
		}

		// Collect sentence boundaries
	} else if flags&SENTENCES != 0 {
		tw.SentenceEnd = func(_ int) {
			writer.WriteByte('\n')
			writer.Flush()
		}

		// Ignore sentence boundaries
	} else {
		tw.SentenceEnd = func(_ int) {}
	}

	// Write token or sentence positions
	if flags&(TOKEN_POS|SENTENCE_POS) != 0 {
		tw.TextEnd = func(_ int) {

			// Write token positions
			if flags&TOKEN_POS != 0 {
				writer.WriteString(strconv.Itoa(pos[0]))
				for _, x := range pos[1:] {
					writer.WriteByte(' ')
					writer.WriteString(strconv.Itoa(x))
				}
				writer.WriteByte('\n')
			}

			// Write sentence positions
			if flags&SENTENCE_POS != 0 {
				writer.WriteString(strconv.Itoa(sent[0]))
				for _, x := range sent[1:] {
					writer.WriteByte(' ')
					writer.WriteString(strconv.Itoa(x))
				}
				writer.WriteByte('\n')
				sent = sent[:0]
				sentB = true
			}

			writer.Flush()

			posC = 0
			pos = pos[:0]
		}

		// Write text end markers only
	} else {
		tw.TextEnd = func(_ int) {
			writer.WriteByte('\n')
			writer.Flush()
		}
	}

	// Flush the writer
	tw.Flush = func() error {
		return writer.Flush()
	}

	return tw
}
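// Usage sketch (hypothetical caller, assuming the tokenizer drives the
// callbacks in scan order): emit tokens and sentence boundaries to stdout.
//
//	tw := NewTokenWriter(os.Stdout, SIMPLE)
//	tw.Token(0, []rune("Hello")) // -> "Hello\n"
//	tw.Token(0, []rune("."))     // -> ".\n"
//	tw.SentenceEnd(0)            // -> "\n" (sentence boundary)
//	tw.TextEnd(0)                // -> "\n", then flush
//	tw.Flush()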