@@ -70,6 +70,12 @@ public class ParquetProperties {
70
70
71
71
private static final int MIN_SLAB_SIZE = 64 ;
72
72
73
+ private enum ByteStreamSplitMode {
74
+ NONE ,
75
+ FLOATING_POINT ,
76
+ EXTENDED
77
+ }
78
+
73
79
public enum WriterVersion {
74
80
PARQUET_1_0 ("v1" ),
75
81
PARQUET_2_0 ("v2" );
@@ -114,7 +120,7 @@ public static WriterVersion fromString(String name) {
114
120
private final ColumnProperty <Integer > numBloomFilterCandidates ;
115
121
private final int pageRowCountLimit ;
116
122
private final boolean pageWriteChecksumEnabled ;
117
- private final boolean enableByteStreamSplit ;
123
+ private final ColumnProperty < ByteStreamSplitMode > byteStreamSplitEnabled ;
118
124
private final Map <String , String > extraMetaData ;
119
125
120
126
private ParquetProperties (Builder builder ) {
@@ -141,10 +147,18 @@ private ParquetProperties(Builder builder) {
141
147
this .numBloomFilterCandidates = builder .numBloomFilterCandidates .build ();
142
148
this .pageRowCountLimit = builder .pageRowCountLimit ;
143
149
this .pageWriteChecksumEnabled = builder .pageWriteChecksumEnabled ;
144
- this .enableByteStreamSplit = builder .enableByteStreamSplit ;
150
+ this .byteStreamSplitEnabled = builder .byteStreamSplitEnabled . build () ;
145
151
this .extraMetaData = builder .extraMetaData ;
146
152
}
147
153
154
+ public static Builder builder () {
155
+ return new Builder ();
156
+ }
157
+
158
+ public static Builder copy (ParquetProperties toCopy ) {
159
+ return new Builder (toCopy );
160
+ }
161
+
148
162
public ValuesWriter newRepetitionLevelWriter (ColumnDescriptor path ) {
149
163
return newColumnDescriptorValuesWriter (path .getMaxRepetitionLevel ());
150
164
}
@@ -208,8 +222,23 @@ public boolean isDictionaryEnabled(ColumnDescriptor column) {
208
222
return dictionaryEnabled .getValue (column );
209
223
}
210
224
225
+ @ Deprecated ()
211
226
public boolean isByteStreamSplitEnabled () {
212
- return enableByteStreamSplit ;
227
+ return byteStreamSplitEnabled .getDefaultValue () != ByteStreamSplitMode .NONE ;
228
+ }
229
+
230
+ public boolean isByteStreamSplitEnabled (ColumnDescriptor column ) {
231
+ switch (column .getPrimitiveType ().getPrimitiveTypeName ()) {
232
+ case FLOAT :
233
+ case DOUBLE :
234
+ return byteStreamSplitEnabled .getValue (column ) != ByteStreamSplitMode .NONE ;
235
+ case INT32 :
236
+ case INT64 :
237
+ case FIXED_LEN_BYTE_ARRAY :
238
+ return byteStreamSplitEnabled .getValue (column ) == ByteStreamSplitMode .EXTENDED ;
239
+ default :
240
+ return false ;
241
+ }
213
242
}
214
243
215
244
public ByteBufferAllocator getAllocator () {
@@ -301,14 +330,6 @@ public Map<String, String> getExtraMetaData() {
301
330
return extraMetaData ;
302
331
}
303
332
304
- public static Builder builder () {
305
- return new Builder ();
306
- }
307
-
308
- public static Builder copy (ParquetProperties toCopy ) {
309
- return new Builder (toCopy );
310
- }
311
-
312
333
@ Override
313
334
public String toString () {
314
335
return "Parquet page size to " + getPageSizeThreshold () + '\n'
@@ -349,11 +370,16 @@ public static class Builder {
349
370
private final ColumnProperty .Builder <Boolean > bloomFilterEnabled ;
350
371
private int pageRowCountLimit = DEFAULT_PAGE_ROW_COUNT_LIMIT ;
351
372
private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED ;
352
- private boolean enableByteStreamSplit = DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED ;
373
+ private final ColumnProperty . Builder < ByteStreamSplitMode > byteStreamSplitEnabled ;
353
374
private Map <String , String > extraMetaData = new HashMap <>();
354
375
355
376
private Builder () {
356
377
enableDict = ColumnProperty .<Boolean >builder ().withDefaultValue (DEFAULT_IS_DICTIONARY_ENABLED );
378
+ byteStreamSplitEnabled = ColumnProperty .<ByteStreamSplitMode >builder ()
379
+ .withDefaultValue (
380
+ DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED
381
+ ? ByteStreamSplitMode .FLOATING_POINT
382
+ : ByteStreamSplitMode .NONE );
357
383
bloomFilterEnabled = ColumnProperty .<Boolean >builder ().withDefaultValue (DEFAULT_BLOOM_FILTER_ENABLED );
358
384
bloomFilterNDVs = ColumnProperty .<Long >builder ().withDefaultValue (null );
359
385
bloomFilterFPPs = ColumnProperty .<Double >builder ().withDefaultValue (DEFAULT_BLOOM_FILTER_FPP );
@@ -365,7 +391,7 @@ private Builder() {
365
391
366
392
private Builder (ParquetProperties toCopy ) {
367
393
this .pageSize = toCopy .pageSizeThreshold ;
368
- this .enableDict = ColumnProperty .< Boolean > builder (toCopy .dictionaryEnabled );
394
+ this .enableDict = ColumnProperty .builder (toCopy .dictionaryEnabled );
369
395
this .dictPageSize = toCopy .dictionaryPageSizeThreshold ;
370
396
this .writerVersion = toCopy .writerVersion ;
371
397
this .minRowCountForPageSizeCheck = toCopy .minRowCountForPageSizeCheck ;
@@ -375,13 +401,13 @@ private Builder(ParquetProperties toCopy) {
375
401
this .allocator = toCopy .allocator ;
376
402
this .pageRowCountLimit = toCopy .pageRowCountLimit ;
377
403
this .pageWriteChecksumEnabled = toCopy .pageWriteChecksumEnabled ;
378
- this .bloomFilterNDVs = ColumnProperty .< Long > builder (toCopy .bloomFilterNDVs );
379
- this .bloomFilterFPPs = ColumnProperty .< Double > builder (toCopy .bloomFilterFPPs );
380
- this .bloomFilterEnabled = ColumnProperty .< Boolean > builder (toCopy .bloomFilterEnabled );
381
- this .adaptiveBloomFilterEnabled = ColumnProperty .< Boolean > builder (toCopy .adaptiveBloomFilterEnabled );
382
- this .numBloomFilterCandidates = ColumnProperty .< Integer > builder (toCopy .numBloomFilterCandidates );
404
+ this .bloomFilterNDVs = ColumnProperty .builder (toCopy .bloomFilterNDVs );
405
+ this .bloomFilterFPPs = ColumnProperty .builder (toCopy .bloomFilterFPPs );
406
+ this .bloomFilterEnabled = ColumnProperty .builder (toCopy .bloomFilterEnabled );
407
+ this .adaptiveBloomFilterEnabled = ColumnProperty .builder (toCopy .adaptiveBloomFilterEnabled );
408
+ this .numBloomFilterCandidates = ColumnProperty .builder (toCopy .numBloomFilterCandidates );
383
409
this .maxBloomFilterBytes = toCopy .maxBloomFilterBytes ;
384
- this .enableByteStreamSplit = toCopy .enableByteStreamSplit ;
410
+ this .byteStreamSplitEnabled = ColumnProperty . builder ( toCopy .byteStreamSplitEnabled ) ;
385
411
this .extraMetaData = toCopy .extraMetaData ;
386
412
}
387
413
@@ -420,8 +446,40 @@ public Builder withDictionaryEncoding(String columnPath, boolean enableDictionar
420
446
return this ;
421
447
}
422
448
423
- public Builder withByteStreamSplitEncoding (boolean enableByteStreamSplit ) {
424
- this .enableByteStreamSplit = enableByteStreamSplit ;
449
+ /**
450
+ * Enable or disable BYTE_STREAM_SPLIT encoding for FLOAT and DOUBLE columns.
451
+ *
452
+ * @param enable whether BYTE_STREAM_SPLIT encoding should be enabled
453
+ * @return this builder for method chaining.
454
+ */
455
+ public Builder withByteStreamSplitEncoding (boolean enable ) {
456
+ this .byteStreamSplitEnabled .withDefaultValue (
457
+ enable ? ByteStreamSplitMode .FLOATING_POINT : ByteStreamSplitMode .NONE );
458
+ return this ;
459
+ }
460
+
461
+ /**
462
+ * Enable or disable BYTE_STREAM_SPLIT encoding for specified columns.
463
+ *
464
+ * @param columnPath the path of the column (dot-string)
465
+ * @param enable whether BYTE_STREAM_SPLIT encoding should be enabled
466
+ * @return this builder for method chaining.
467
+ */
468
+ public Builder withByteStreamSplitEncoding (String columnPath , boolean enable ) {
469
+ this .byteStreamSplitEnabled .withValue (
470
+ columnPath , enable ? ByteStreamSplitMode .EXTENDED : ByteStreamSplitMode .NONE );
471
+ return this ;
472
+ }
473
+
474
+ /**
475
+ * Enable or disable BYTE_STREAM_SPLIT encoding for FLOAT, DOUBLE, INT32, INT64 and FIXED_LEN_BYTE_ARRAY columns.
476
+ *
477
+ * @param enable whether BYTE_STREAM_SPLIT encoding should be enabled
478
+ * @return this builder for method chaining.
479
+ */
480
+ public Builder withExtendedByteStreamSplitEncoding (boolean enable ) {
481
+ this .byteStreamSplitEnabled .withDefaultValue (
482
+ enable ? ByteStreamSplitMode .EXTENDED : ByteStreamSplitMode .NONE );
425
483
return this ;
426
484
}
427
485
0 commit comments