forked from segmentio/parquet-go
-
Notifications
You must be signed in to change notification settings - Fork 0
/
encoding.go
143 lines (124 loc) · 3.95 KB
/
encoding.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
package parquet
import (
"math/bits"
"github.com/segmentio/parquet-go/encoding"
"github.com/segmentio/parquet-go/encoding/bitpacked"
"github.com/segmentio/parquet-go/encoding/bytestreamsplit"
"github.com/segmentio/parquet-go/encoding/delta"
"github.com/segmentio/parquet-go/encoding/plain"
"github.com/segmentio/parquet-go/encoding/rle"
"github.com/segmentio/parquet-go/format"
)
// Encodings implemented by this package, exposed as package-level values.
var (
	// Plain is the default parquet encoding.
	Plain plain.Encoding

	// RLE is the hybrid bit-pack/run-length parquet encoding.
	RLE rle.Encoding

	// BitPacked is the deprecated bit-packed encoding for repetition and
	// definition levels.
	BitPacked bitpacked.Encoding

	// PlainDictionary is the plain dictionary parquet encoding.
	//
	// Parquet 2.0 and later should not use this encoding anymore; it is
	// implemented for backwards compatibility, to support reading files
	// that were encoded with older parquet libraries.
	PlainDictionary plain.DictionaryEncoding

	// RLEDictionary is the RLE dictionary parquet encoding.
	RLEDictionary rle.DictionaryEncoding

	// DeltaBinaryPacked is the delta binary packed parquet encoding.
	DeltaBinaryPacked delta.BinaryPackedEncoding

	// DeltaLengthByteArray is the delta length byte array parquet encoding.
	DeltaLengthByteArray delta.LengthByteArrayEncoding

	// DeltaByteArray is the delta byte array parquet encoding.
	DeltaByteArray delta.ByteArrayEncoding

	// ByteStreamSplit is an encoding for floating-point data.
	ByteStreamSplit bytestreamsplit.Encoding
)

// encodings is the table indexing the encodings supported by this package.
//
// It is keyed by the format encoding code and each entry points at the
// matching package-level variable declared above. Any code within the bounds
// of the array that has no entry listed here is left as a nil interface
// value.
var encodings = [...]encoding.Encoding{
	format.Plain:                &Plain,
	format.PlainDictionary:      &PlainDictionary,
	format.BitPacked:            &BitPacked,
	format.RLE:                  &RLE,
	format.RLEDictionary:        &RLEDictionary,
	format.DeltaBinaryPacked:    &DeltaBinaryPacked,
	format.DeltaLengthByteArray: &DeltaLengthByteArray,
	format.DeltaByteArray:       &DeltaByteArray,
	format.ByteStreamSplit:      &ByteStreamSplit,
}

// Tables indexing the encodings for repetition and definition levels of all
// supported bit widths. The entry at index i is configured with a bit width
// of i+1, covering widths 1 through 8.
var (
	// levelEncodingsRLE holds the RLE level encodings.
	levelEncodingsRLE = [...]rle.Encoding{
		{BitWidth: 1},
		{BitWidth: 2},
		{BitWidth: 3},
		{BitWidth: 4},
		{BitWidth: 5},
		{BitWidth: 6},
		{BitWidth: 7},
		{BitWidth: 8},
	}

	// levelEncodingsBitPacked holds the bit-packed level encodings.
	levelEncodingsBitPacked = [...]bitpacked.Encoding{
		{BitWidth: 1},
		{BitWidth: 2},
		{BitWidth: 3},
		{BitWidth: 4},
		{BitWidth: 5},
		{BitWidth: 6},
		{BitWidth: 7},
		{BitWidth: 8},
	}
)
// isDictionaryEncoding reports whether the format code advertised by the
// given encoding is one of the dictionary encodings, as decided by
// isDictionaryFormat.
func isDictionaryEncoding(encoding encoding.Encoding) bool {
	code := encoding.Encoding()
	return isDictionaryFormat(code)
}
// isDictionaryFormat reports whether the given format encoding code is
// format.PlainDictionary or format.RLEDictionary.
func isDictionaryFormat(encoding format.Encoding) bool {
	switch encoding {
	case format.PlainDictionary, format.RLEDictionary:
		return true
	}
	return false
}
// LookupEncoding returns the parquet encoding associated with the given code.
//
// The result is never nil: a code that falls outside of the encodings table,
// or whose slot in the table is empty, yields encoding.NotSupported.
func LookupEncoding(enc format.Encoding) encoding.Encoding {
	// Reject codes that cannot be used as an index into the table.
	if enc < 0 || int(enc) >= len(encodings) {
		return encoding.NotSupported{}
	}
	found := encodings[enc]
	if found == nil {
		// The code is in range but no encoding was registered for it.
		return encoding.NotSupported{}
	}
	return found
}
// lookupLevelEncoding returns the encoding to use for repetition or
// definition levels whose largest value is max.
//
// The bit width is the number of bits needed to represent max. For format.RLE
// and format.BitPacked the function returns a pointer to the entry of
// levelEncodingsRLE or levelEncodingsBitPacked configured with that width.
// Any other encoding code yields encoding.NotSupported.
//
// A max of zero needs no bits and has no entry in the level tables, so
// encoding.NotSupported is returned in that case rather than indexing the
// tables at -1 and panicking.
func lookupLevelEncoding(enc format.Encoding, max byte) encoding.Encoding {
	// bits.Len8 reports 0 for a zero input, which would give index -1 below.
	if max == 0 {
		return encoding.NotSupported{}
	}
	// max is a non-zero byte, so width is in 1..8 and width-1 is always a
	// valid index into the 8-entry level tables.
	width := bits.Len8(max)
	switch enc {
	case format.RLE:
		return &levelEncodingsRLE[width-1]
	case format.BitPacked:
		return &levelEncodingsBitPacked[width-1]
	default:
		return encoding.NotSupported{}
	}
}
// canEncode reports whether the encoding e can be applied to values of
// kind k.
//
// Dictionary encodings are accepted for every kind, without looking at k.
// For any other encoding the decision is delegated to the encoding.CanEncode*
// helper matching k, and an unrecognized kind is never encodable.
func canEncode(e encoding.Encoding, k Kind) bool {
	switch {
	case isDictionaryEncoding(e):
		return true
	case k == Boolean:
		return encoding.CanEncodeBoolean(e)
	case k == Int32:
		return encoding.CanEncodeInt32(e)
	case k == Int64:
		return encoding.CanEncodeInt64(e)
	case k == Int96:
		return encoding.CanEncodeInt96(e)
	case k == Float:
		return encoding.CanEncodeFloat(e)
	case k == Double:
		return encoding.CanEncodeDouble(e)
	case k == ByteArray:
		return encoding.CanEncodeByteArray(e)
	case k == FixedLenByteArray:
		return encoding.CanEncodeFixedLenByteArray(e)
	}
	return false
}