Skip to content

Commit fbbc1c4

Browse files
committed
PARQUET-372: Do not write stats larger than 4k.
This updates the stats conversion to check whether the combined size of the min and max values for page stats reaches 4k (4096 bytes). If so, no statistics for a page are written, including the null count.
1 parent 39a3cd0 commit fbbc1c4

File tree

10 files changed

+228
-2
lines changed

10 files changed

+228
-2
lines changed

parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@ public byte[] getMinBytes() {
6767
return min == null ? null : min.getBytes();
6868
}
6969

70+
/**
* Reports whether the combined byte length of min and max is strictly below
* {@code size}. Also returns true when no non-null value has been recorded.
*
* @param size a size in bytes
* @return true if there is no non-null value, or min length + max length &lt; size
*/
@Override
71+
public boolean isSmallerThan(long size) {
72+
// Widen to long before adding: two very large int lengths could otherwise
// wrap to a negative sum and wrongly pass the check (harmless if length()
// already returns long).
return !hasNonNullValue() || (((long) min.length() + max.length()) < size);
73+
}
74+
7075
@Override
7176
public String toString() {
7277
if (this.hasNonNullValue())
@@ -77,11 +82,19 @@ else if (!this.isEmpty())
7782
return "no stats for this column";
7883
}
7984

85+
/**
* Widens the tracked range from a (min, max) pair: min is replaced by a copy
* of min_value when the current min compares greater, and max is replaced by
* a copy of max_value when the current max compares smaller.
* NOTE(review): compareTo is invoked on the current min and max, so this
* presumably expects both to be set already; confirm against callers.
*
86+
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
87+
*/
88+
@Deprecated
8089
public void updateStats(Binary min_value, Binary max_value) {
8190
if (min.compareTo(min_value) > 0) { min = min_value.copy(); }
8291
if (max.compareTo(max_value) < 0) { max = max_value.copy(); }
8392
}
8493

94+
/**
95+
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
96+
*/
97+
@Deprecated
8598
public void initializeStats(Binary min_value, Binary max_value) {
8699
min = min_value.copy();
87100
max = max_value.copy();
@@ -98,14 +111,26 @@ public Binary genericGetMax() {
98111
return max;
99112
}
100113

114+
/**
* Returns the current max field as-is (no copy is made here).
*
115+
* @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0
116+
*/
117+
@Deprecated
101118
public Binary getMax() {
102119
return max;
103120
}
104121

122+
/**
* Returns the current min field as-is (no copy is made here).
*
123+
* @deprecated use {@link #genericGetMin()}, will be removed in 2.0.0
124+
*/
125+
@Deprecated
105126
public Binary getMin() {
106127
return min;
107128
}
108129

130+
/**
131+
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
132+
*/
133+
@Deprecated
109134
public void setMinMax(Binary min, Binary max) {
110135
this.max = max;
111136
this.min = min;

parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public byte[] getMinBytes() {
6161
return BytesUtils.booleanToBytes(min);
6262
}
6363

64+
/**
* Fixed-width size check: returns true when no non-null value has been
* recorded, or when the constant 2 is strictly less than {@code size}.
* NOTE(review): 2 is presumably the encoded size of min plus max (one byte
* each); confirm against BytesUtils.booleanToBytes.
*/
@Override
65+
public boolean isSmallerThan(long size) {
66+
return !hasNonNullValue() || (2 < size);
67+
}
68+
6469
@Override
6570
public String toString() {
6671
if (this.hasNonNullValue())

parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public byte[] getMinBytes() {
6161
return BytesUtils.longToBytes(Double.doubleToLongBits(min));
6262
}
6363

64+
/**
* Fixed-width size check: returns true when no non-null value has been
* recorded, or when the constant 16 is strictly less than {@code size}.
* NOTE(review): 16 is presumably the encoded size of min plus max (8 bytes
* each, via BytesUtils.longToBytes); confirm.
*/
@Override
65+
public boolean isSmallerThan(long size) {
66+
return !hasNonNullValue() || (16 < size);
67+
}
68+
6469
@Override
6570
public String toString() {
6671
if(this.hasNonNullValue())

parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public byte[] getMinBytes() {
6161
return BytesUtils.intToBytes(Float.floatToIntBits(min));
6262
}
6363

64+
/**
* Fixed-width size check: returns true when no non-null value has been
* recorded, or when the constant 8 is strictly less than {@code size}.
* NOTE(review): 8 is presumably the encoded size of min plus max (4 bytes
* each, via BytesUtils.intToBytes); confirm.
*/
@Override
65+
public boolean isSmallerThan(long size) {
66+
return !hasNonNullValue() || (8 < size);
67+
}
68+
6469
@Override
6570
public String toString() {
6671
if (this.hasNonNullValue())

parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public byte[] getMinBytes() {
6161
return BytesUtils.intToBytes(min);
6262
}
6363

64+
/**
* Fixed-width size check: returns true when no non-null value has been
* recorded, or when the constant 8 is strictly less than {@code size}.
* NOTE(review): 8 is presumably the encoded size of min plus max (4 bytes
* each, via BytesUtils.intToBytes); confirm.
*/
@Override
65+
public boolean isSmallerThan(long size) {
66+
return !hasNonNullValue() || (8 < size);
67+
}
68+
6469
@Override
6570
public String toString() {
6671
if (this.hasNonNullValue())

parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public byte[] getMinBytes() {
6161
return BytesUtils.longToBytes(min);
6262
}
6363

64+
/**
* Fixed-width size check: returns true when no non-null value has been
* recorded, or when the constant 16 is strictly less than {@code size}.
* NOTE(review): 16 is presumably the encoded size of min plus max (8 bytes
* each, via BytesUtils.longToBytes); confirm.
*/
@Override
65+
public boolean isSmallerThan(long size) {
66+
return !hasNonNullValue() || (16 < size);
67+
}
68+
6469
@Override
6570
public String toString() {
6671
if (this.hasNonNullValue())

parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,14 @@ public void mergeStatistics(Statistics stats) {
190190
*/
191191
abstract public byte[] getMinBytes();
192192

193+
/**
194+
* Abstract method to return whether the min and max values, taken together,
195+
* are strictly smaller than the given size.
196+
* @param size a size in bytes
197+
* @return true iff the combined size of the min and max values is less than
* size bytes; the subclasses visible alongside this change also return
* true when hasNonNullValue() is false
198+
*/
199+
abstract public boolean isSmallerThan(long size);
200+
193201
/**
194202
* toString() to display min, max, num_nulls in a string
195203
*/

parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ public class ParquetMetadataConverter {
7777

7878
public static final MetadataFilter NO_FILTER = new NoFilter();
7979
public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter();
80+
// Bound, in bytes, passed to Statistics.isSmallerThan() by toParquetStatistics();
// statistics that are not smaller than this are left unset in the converted stats.
public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k
8081

8182
private static final Log LOG = Log.getLog(ParquetMetadataConverter.class);
8283

@@ -284,7 +285,7 @@ dataPageType, getEncoding(encoding),
284285
public static Statistics toParquetStatistics(
285286
org.apache.parquet.column.statistics.Statistics statistics) {
286287
Statistics stats = new Statistics();
287-
if (!statistics.isEmpty()) {
288+
if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) {
288289
stats.setNull_count(statistics.getNumNulls());
289290
if (statistics.hasNonNullValue()) {
290291
stats.setMax(statistics.getMaxBytes());
@@ -293,6 +294,7 @@ public static Statistics toParquetStatistics(
293294
}
294295
return stats;
295296
}
297+
296298
/**
297299
* @deprecated Replaced by {@link #fromParquetStatistics(
298300
* String createdBy, Statistics statistics, PrimitiveTypeName type)}

parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java

Lines changed: 158 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,21 @@
4545
import java.util.TreeSet;
4646

4747
import com.google.common.collect.Sets;
48+
import org.apache.parquet.Version;
49+
import org.apache.parquet.bytes.BytesUtils;
4850
import org.apache.parquet.column.statistics.BinaryStatistics;
51+
import org.apache.parquet.column.statistics.BooleanStatistics;
52+
import org.apache.parquet.column.statistics.DoubleStatistics;
53+
import org.apache.parquet.column.statistics.FloatStatistics;
54+
import org.apache.parquet.column.statistics.IntStatistics;
55+
import org.apache.parquet.column.statistics.LongStatistics;
56+
import org.apache.parquet.column.statistics.Statistics;
4957
import org.apache.parquet.hadoop.metadata.BlockMetaData;
5058
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
5159
import org.apache.parquet.hadoop.metadata.ColumnPath;
5260
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
5361
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
62+
import org.apache.parquet.io.api.Binary;
5463
import org.junit.Assert;
5564
import org.junit.Test;
5665

@@ -357,5 +366,153 @@ public void testEncodingsCache() {
357366
assertEquals("java.util.Collections$UnmodifiableSet", res1.getClass().getName());
358367
assertEquals("java.util.Collections$UnmodifiableSet", res2.getClass().getName());
359368
assertEquals("java.util.Collections$UnmodifiableSet", res3.getClass().getName());
360-
}
369+
}
370+
371+
// Checks the boundary of BinaryStatistics.isSmallerThan and that
// toParquetStatistics writes stats below MAX_STATS_SIZE and omits them otherwise.
@Test
372+
public void testBinaryStats() {
373+
// make fake stats and verify the size check
374+
BinaryStatistics stats = new BinaryStatistics();
375+
stats.incrementNumNulls(3004);
376+
byte[] min = new byte[904];
377+
byte[] max = new byte[2388];
378+
stats.updateStats(Binary.fromConstantByteArray(min));
379+
stats.updateStats(Binary.fromConstantByteArray(max));
380+
// 904 + 2388 = 3292 bytes; the comparison is strict, so the exact total
// must fail and total + 1 must pass
long totalLen = min.length + max.length;
381+
Assert.assertFalse("Should not be smaller than min + max size",
382+
stats.isSmallerThan(totalLen));
383+
Assert.assertTrue("Should be smaller than min + max size + 1",
384+
stats.isSmallerThan(totalLen + 1));
385+
386+
// 3292 bytes is below MAX_STATS_SIZE (4096), so the stats are written
org.apache.parquet.format.Statistics formatStats =
387+
ParquetMetadataConverter.toParquetStatistics(stats);
388+
389+
Assert.assertArrayEquals("Min should match", min, formatStats.getMin());
390+
Assert.assertArrayEquals("Max should match", max, formatStats.getMax());
391+
Assert.assertEquals("Num nulls should match",
392+
3004, formatStats.getNull_count());
393+
394+
// convert to empty stats because the values are too large
// (2388 + 2388 = 4776 bytes, which is not below MAX_STATS_SIZE)
395+
stats.setMinMaxFromBytes(max, max);
396+
397+
formatStats = ParquetMetadataConverter.toParquetStatistics(stats);
398+
399+
// the null count is dropped together with min and max
Assert.assertFalse("Min should not be set", formatStats.isSetMin());
400+
Assert.assertFalse("Max should not be set", formatStats.isSetMax());
401+
Assert.assertFalse("Num nulls should not be set",
402+
formatStats.isSetNull_count());
403+
404+
Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatistics(
405+
Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY);
406+
407+
Assert.assertTrue(roundTripStats.isEmpty());
408+
}
409+
410+
// Checks that int statistics are written by toParquetStatistics and that
// min, max and the null count can be read back from the converted stats.
@Test
411+
public void testIntegerStats() {
412+
// make fake stats; IntStatistics.isSmallerThan compares the constant 8
// with the limit, so these stats are expected to be written
413+
IntStatistics stats = new IntStatistics();
414+
stats.incrementNumNulls(3004);
415+
int min = Integer.MIN_VALUE;
416+
int max = Integer.MAX_VALUE;
417+
stats.updateStats(min);
418+
stats.updateStats(max);
419+
420+
org.apache.parquet.format.Statistics formatStats =
421+
ParquetMetadataConverter.toParquetStatistics(stats);
422+
423+
Assert.assertEquals("Min should match",
424+
min, BytesUtils.bytesToInt(formatStats.getMin()));
425+
Assert.assertEquals("Max should match",
426+
max, BytesUtils.bytesToInt(formatStats.getMax()));
427+
Assert.assertEquals("Num nulls should match",
428+
3004, formatStats.getNull_count());
429+
}
430+
431+
// Checks that long statistics are written by toParquetStatistics and that
// min, max and the null count can be read back from the converted stats.
@Test
432+
public void testLongStats() {
433+
// make fake stats; LongStatistics.isSmallerThan compares the constant 16
// with the limit, so these stats are expected to be written
434+
LongStatistics stats = new LongStatistics();
435+
stats.incrementNumNulls(3004);
436+
long min = Long.MIN_VALUE;
437+
long max = Long.MAX_VALUE;
438+
stats.updateStats(min);
439+
stats.updateStats(max);
440+
441+
org.apache.parquet.format.Statistics formatStats =
442+
ParquetMetadataConverter.toParquetStatistics(stats);
443+
444+
Assert.assertEquals("Min should match",
445+
min, BytesUtils.bytesToLong(formatStats.getMin()));
446+
Assert.assertEquals("Max should match",
447+
max, BytesUtils.bytesToLong(formatStats.getMax()));
448+
Assert.assertEquals("Num nulls should match",
449+
3004, formatStats.getNull_count());
450+
}
451+
452+
// Checks that float statistics are written by toParquetStatistics and that
// min, max and the null count can be read back from the converted stats.
@Test
453+
public void testFloatStats() {
454+
// make fake stats; FloatStatistics.isSmallerThan compares the constant 8
// with the limit, so these stats are expected to be written
455+
FloatStatistics stats = new FloatStatistics();
456+
stats.incrementNumNulls(3004);
457+
float min = Float.MIN_VALUE;
458+
float max = Float.MAX_VALUE;
459+
stats.updateStats(min);
460+
stats.updateStats(max);
461+
462+
org.apache.parquet.format.Statistics formatStats =
463+
ParquetMetadataConverter.toParquetStatistics(stats);
464+
465+
// The values travel as raw bits, so compare exactly (delta 0.0). A delta
// of 0.000001 is far larger than Float.MIN_VALUE and would accept 0.
Assert.assertEquals("Min should match",
466+
min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())),
467+
0.0);
468+
Assert.assertEquals("Max should match",
469+
max, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMax())),
470+
0.0);
471+
Assert.assertEquals("Num nulls should match",
472+
3004, formatStats.getNull_count());
473+
}
474+
475+
// Checks that double statistics are written by toParquetStatistics and that
// min, max and the null count can be read back from the converted stats.
@Test
476+
public void testDoubleStats() {
477+
// make fake stats; DoubleStatistics.isSmallerThan compares the constant 16
// with the limit, so these stats are expected to be written
478+
DoubleStatistics stats = new DoubleStatistics();
479+
stats.incrementNumNulls(3004);
480+
double min = Double.MIN_VALUE;
481+
double max = Double.MAX_VALUE;
482+
stats.updateStats(min);
483+
stats.updateStats(max);
484+
485+
org.apache.parquet.format.Statistics formatStats =
486+
ParquetMetadataConverter.toParquetStatistics(stats);
487+
488+
// The values travel as raw bits, so compare exactly (delta 0.0). A delta
// of 0.000001 is far larger than Double.MIN_VALUE and would accept 0.
Assert.assertEquals("Min should match",
489+
min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())),
490+
0.0);
491+
Assert.assertEquals("Max should match",
492+
max, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMax())),
493+
0.0);
494+
Assert.assertEquals("Num nulls should match",
495+
3004, formatStats.getNull_count());
496+
}
497+
498+
// Checks that boolean statistics are written by toParquetStatistics and that
// min, max and the null count can be read back from the converted stats.
@Test
499+
public void testBooleanStats() {
500+
// make fake stats; BooleanStatistics.isSmallerThan compares the constant 2
// with the limit, so these stats are expected to be written
501+
BooleanStatistics stats = new BooleanStatistics();
502+
stats.incrementNumNulls(3004);
503+
boolean min = Boolean.FALSE;
504+
boolean max = Boolean.TRUE;
505+
stats.updateStats(min);
506+
stats.updateStats(max);
507+
508+
org.apache.parquet.format.Statistics formatStats =
509+
ParquetMetadataConverter.toParquetStatistics(stats);
510+
511+
Assert.assertEquals("Min should match",
512+
min, BytesUtils.bytesToBool(formatStats.getMin()));
513+
Assert.assertEquals("Max should match",
514+
max, BytesUtils.bytesToBool(formatStats.getMax()));
515+
Assert.assertEquals("Num nulls should match",
516+
3004, formatStats.getNull_count());
517+
}
361518
}

parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,15 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc
282282
PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
283283
Statistics stats = getStatisticsFromPageHeader(page);
284284

285+
if (stats.isEmpty()) {
286+
// stats are empty if num nulls = 0 and there are no non-null values
287+
// this happens if stats are not written (e.g., when stats are too big)
288+
System.err.println(String.format(
289+
"No stats written for page=%s col=%s",
290+
page, Arrays.toString(desc.getPath())));
291+
return;
292+
}
293+
285294
long numNulls = 0;
286295
ColumnReaderImpl column = new ColumnReaderImpl(desc, reader, converter, null);
287296
for (int i = 0; i < reader.getTotalValueCount(); i += 1) {

0 commit comments

Comments
 (0)