// Source: reader.go, forked from colinmarc/sequencefile.

package sequencefile
import (
"bufio"
"bytes"
"compress/gzip"
"encoding/binary"
"errors"
"fmt"
"io"
"os"
)
// A Reader reads key/value pairs from a SequenceFile input stream.
//
// A reader is valid at any key or block offset; it's safe to start in the
// middle of a file or seek the underlying input stream if the location was
// recorded between calls to Scan, and as long as you call Reset after seeking.
// Note, however, that with a block-compressed file (Header.Compression set to
// BlockCompression), the position will be at the beginning of the block that
// holds the key, not right before the key itself.
type Reader struct {
// Header holds the file's header information.
// NOTE(review): presumably populated by ReadHeader, which is not shown
// here - confirm.
Header Header
// syncMarkerBytes is the sync marker every marker in the stream is compared
// against. When it is nil, checkSync adopts the first marker it reads.
syncMarkerBytes []byte
// reader is the underlying input stream.
reader io.Reader
// closed is set by close; scanRecord returns false once it is set.
closed bool
// err is the error handed to close; Err returns it.
err error
// compression selects the scanning strategy: Scan uses scanBlock for
// BlockCompression and scanRecord otherwise.
compression Compression
// codec selects which decompressor getDecompressor creates.
codec CompressionCodec
// decompressor is created on first use by getDecompressor and afterwards
// reused by calling its Reset method with the new source.
decompressor decompressor
// buf is the scratch buffer consume and consumeCompressed append to; the
// slices they return (including key and value) alias its contents.
buf bytes.Buffer
// block is replaced with its zero value by Reset.
// NOTE(review): presumably the state used by scanBlock for block-compressed
// files - confirm against the block reader implementation.
block blockReader
// key and value are the current record's key and value, as returned by Key
// and Value.
key []byte
value []byte
}
// Open opens the SequenceFile at path and immediately reads its header.
//
// It returns the error from os.Open if the file cannot be opened, or the error
// from ReadHeader if the header cannot be read. In the latter case the file is
// closed before returning, since the caller never receives the Reader and
// would otherwise have no way to release the descriptor.
func Open(path string) (*Reader, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}

	r := NewReader(bufio.NewReader(f))
	if err := r.ReadHeader(); err != nil {
		// Best effort: the header error is the one worth reporting, so a
		// failure to close is deliberately not allowed to mask it.
		_ = f.Close()
		return nil, err
	}

	return r, nil
}
// NewReader returns a new Reader for a SequenceFile that pulls its data from
// r. When r is positioned at the very start of a file, call ReadHeader right
// away so the header is read through before scanning.
func NewReader(r io.Reader) *Reader {
	rd := new(Reader)
	rd.reader = r
	return rd
}
// NewReaderCompression returns a new Reader for a SequenceFile that pulls its
// data from r, with the compression mode and codec supplied explicitly.
// Compression options are normally inferred from the file's header; use this
// constructor when the header is unavailable because reading starts
// mid-stream.
func NewReaderCompression(r io.Reader, compression Compression, codec CompressionCodec) *Reader {
	return &Reader{
		reader:      r,
		compression: compression,
		codec:       codec,
	}
}
// Scan moves the reader forward to the next record and loads its key and
// value into memory, where they can be fetched with Key and Value. It returns
// false when the end of the file is reached or an error occurs.
func (r *Reader) Scan() bool {
	// Only block-compressed streams need the block scanner; every other mode
	// is read one record at a time.
	if r.compression != BlockCompression {
		return r.scanRecord()
	}
	return r.scanBlock()
}
// Reset discards the reader's buffered data and block state while keeping the
// compression settings and header information. Call it after seeking the
// underlying reader; to read a different file, create a new Reader instead.
func (r *Reader) Reset() {
	r.block = blockReader{}
	r.buf.Reset()
}
// Err returns the error that stopped scanning, or nil if none has been
// recorded. It reports whatever error was handed to close. When scanning
// record by record, reaching the end of the input exactly at a record boundary
// is not recorded as an error.
func (r *Reader) Err() error {
return r.err
}
// Key returns the key for the current record. The byte slice will be reused
// after the next call to Scan, so copy it if it needs to outlive the current
// record.
func (r *Reader) Key() []byte {
return r.key
}
// Value returns the value for the current record. The byte slice will be
// reused after the next call to Scan, so copy it if it needs to outlive the
// current record.
func (r *Reader) Value() []byte {
return r.value
}
// scanRecord reads the next record from an uncompressed or record-compressed
// stream into r.key and r.value, skipping any sync markers that precede it.
//
// It returns true when a record was read. It returns false when the reader is
// already closed, when the input ends cleanly at a record boundary (no error
// is recorded), or when reading fails, in which case the reader is closed and
// the error is available from Err. Input that ends part-way through a record
// is reported as io.ErrUnexpectedEOF rather than io.EOF.
func (r *Reader) scanRecord() bool {
	if r.closed {
		return false
	}

	// fail closes the reader with err and reports that no record was read. It
	// is only used once part of a record (or sync marker) has been consumed,
	// so an io.EOF at that point means the stream was truncated.
	fail := func(err error) bool {
		if err == io.EOF {
			err = io.ErrUnexpectedEOF
		}
		r.close(err)
		return false
	}

	// Read the record length, skipping sync markers. A loop is used rather
	// than recursion so a long run of markers cannot grow the stack.
	var totalLength int
	for {
		r.clear()
		b, err := r.consume(4)
		if err == io.EOF {
			// Clean end of input at a record boundary.
			return false
		}
		if err != nil {
			r.close(err)
			return false
		}

		// Length -1 means a sync marker (the length is obnoxiously encoded as
		// a cast uint32 just for this).
		totalLength = int(int32(binary.BigEndian.Uint32(b)))
		if totalLength != -1 {
			break
		}

		if err := r.checkSync(); err != nil {
			return fail(err)
		}
	}

	if totalLength < 8 {
		r.close(fmt.Errorf("sequencefile: invalid record length: %d", totalLength))
		return false
	}

	r.clear()
	b, err := r.consume(4)
	if err != nil {
		return fail(err)
	}

	// The key must fit inside the record; otherwise the value length below
	// would be negative.
	keyLength := int(int32(binary.BigEndian.Uint32(b)))
	if keyLength < 4 || keyLength > totalLength {
		r.close(fmt.Errorf("sequencefile: invalid key length: %d", keyLength))
		return false
	}
	valueLength := totalLength - keyLength

	// Start from an empty buffer so the key and value are the only contents.
	r.clear()
	r.key, err = r.consume(keyLength)
	if err != nil {
		return fail(err)
	}

	if r.compression == RecordCompression {
		r.value, err = r.consumeCompressed(valueLength)
	} else {
		r.value, err = r.consume(valueLength)
	}
	if err != nil {
		return fail(err)
	}

	return true
}
// checkSync reads SyncSize bytes from the input and verifies that they match
// the reader's sync marker. If no marker is known yet, the bytes just read are
// stored as the marker. It returns the read error, if any, or an error when
// the marker does not match.
func (r *Reader) checkSync() error {
	r.clear()
	marker, err := r.consume(SyncSize)
	if err != nil {
		return err
	}

	switch {
	case r.syncMarkerBytes == nil:
		// If we never read the Header, infer the sync marker from the first
		// time we see it. A copy is kept because marker aliases the scratch
		// buffer.
		saved := make([]byte, SyncSize)
		copy(saved, marker)
		r.syncMarkerBytes = saved
	case !bytes.Equal(marker, r.syncMarkerBytes):
		return errors.New("sequencefile: invalid sync marker")
	}

	return nil
}
// consume reads exactly n bytes off the input stream, appending them to the
// scratch buffer, and returns a byte slice that is only valid until the next
// call to clear.
//
// It returns an error if n is negative. If the input is already exhausted it
// returns io.EOF; if the input ends after some, but fewer than n, bytes were
// read it returns io.ErrUnexpectedEOF, so that callers treating io.EOF as a
// clean end of stream do not mistake a truncated field for one.
func (r *Reader) consume(n int) ([]byte, error) {
	// io.CopyN would treat a negative count as "nothing to read" and succeed,
	// hiding a corrupt length field from the caller.
	if n < 0 {
		return nil, fmt.Errorf("sequencefile: invalid read length: %d", n)
	}

	off := r.buf.Len()
	written, err := io.CopyN(&r.buf, r.reader, int64(n))
	if err != nil {
		// io.CopyN reports io.EOF even when it stopped part-way through.
		if err == io.EOF && written > 0 {
			err = io.ErrUnexpectedEOF
		}
		return nil, err
	}

	return r.buf.Bytes()[off:], nil
}
// consumeCompressed reads a compressed value occupying n bytes of the input
// stream, decompresses it into the scratch buffer, and returns the
// decompressed bytes. The returned slice is only valid until the next call to
// clear.
//
// It returns an error if n is negative, if the decompressor cannot be set up
// or fails, or io.ErrUnexpectedEOF if the decompressor finished without
// consuming all n bytes.
func (r *Reader) consumeCompressed(n int) ([]byte, error) {
	// A negative length can only come from a corrupt record and would make
	// bytes.Buffer.Grow panic below.
	if n < 0 {
		return nil, fmt.Errorf("sequencefile: invalid compressed length: %d", n)
	}

	// Cap the decompressor at the n bytes that belong to this value.
	lr := &io.LimitedReader{R: r.reader, N: int64(n)}
	d, err := r.getDecompressor(lr)
	if err != nil {
		return nil, err
	}

	off := r.buf.Len()
	r.buf.Grow(n)
	if _, err = r.buf.ReadFrom(d); err != nil {
		return nil, err
	}
	if lr.N > 0 {
		return nil, io.ErrUnexpectedEOF
	}

	return r.buf.Bytes()[off:], nil
}
// clear empties the scratch buffer. Slices previously returned by consume and
// consumeCompressed alias that buffer and must not be used afterwards.
func (r *Reader) clear() {
r.buf.Reset()
}
// getDecompressor returns a decompressing reader over src for the reader's
// codec. The decompressor is created on first use and reused afterwards by
// resetting it onto the new source.
//
// It returns a nil reader and an error if the decompressor cannot be reset or
// created, or if the codec is not one of the supported values.
func (r *Reader) getDecompressor(src io.Reader) (io.Reader, error) {
	if r.decompressor != nil {
		if err := r.decompressor.Reset(src); err != nil {
			return nil, err
		}
		return r.decompressor, nil
	}

	// Each constructor's result is held in a local and stored only on
	// success. Assigning a failed constructor's nil pointer straight into the
	// r.decompressor interface would make it compare non-nil, and close would
	// then call Close on a nil pointer.
	switch r.codec {
	case GzipCompression:
		d, err := gzip.NewReader(src)
		if err != nil {
			return nil, err
		}
		r.decompressor = d
	case SnappyCompression:
		d, err := newSnappyFrameReader(src)
		if err != nil {
			return nil, err
		}
		r.decompressor = d
	case ZlibCompression:
		d, err := newZlibReaderWrapper(src)
		if err != nil {
			return nil, err
		}
		r.decompressor = d
	default:
		// Reported as an error rather than a panic so a bad codec value
		// cannot crash the caller.
		return nil, errors.New("sequencefile: compression set without codec")
	}

	return r.decompressor, nil
}
// close marks the reader as closed, records err as the value Err will return,
// empties the scratch buffer, and closes the decompressor if one was created.
func (r *Reader) close(err error) {
	r.closed, r.err = true, err
	r.clear()

	if d := r.decompressor; d != nil {
		d.Close()
	}
}