Skip to content

Commit 048cc62

Browse files
committed
PARQUET-372: Do not write stats larger than 4k.
This updates the stats conversion to check whether the combined size of the min and max values for page stats is 4k (4096 bytes) or larger. If so, no statistics are written for that page.
1 parent c381968 commit 048cc62

File tree

10 files changed

+228
-2
lines changed

10 files changed

+228
-2
lines changed

parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@ public byte[] getMinBytes() {
6767
return min == null ? null : min.getBytes();
6868
}
6969

70+
@Override
71+
public boolean isSmallerThan(long size) {
72+
return !hasNonNullValue() || ((min.length() + max.length()) < size);
73+
}
74+
7075
@Override
7176
public String toString() {
7277
if (this.hasNonNullValue())
@@ -77,11 +82,19 @@ else if (!this.isEmpty())
7782
return "no stats for this column";
7883
}
7984

85+
/**
86+
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
87+
*/
88+
@Deprecated
8089
public void updateStats(Binary min_value, Binary max_value) {
8190
if (min.compareTo(min_value) > 0) { min = min_value.copy(); }
8291
if (max.compareTo(max_value) < 0) { max = max_value.copy(); }
8392
}
8493

94+
/**
95+
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
96+
*/
97+
@Deprecated
8598
public void initializeStats(Binary min_value, Binary max_value) {
8699
min = min_value.copy();
87100
max = max_value.copy();
@@ -98,14 +111,26 @@ public Binary genericGetMax() {
98111
return max;
99112
}
100113

114+
/**
115+
* @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0
116+
*/
117+
@Deprecated
101118
public Binary getMax() {
102119
return max;
103120
}
104121

122+
/**
123+
* @deprecated use {@link #genericGetMin()}, will be removed in 2.0.0
124+
*/
125+
@Deprecated
105126
public Binary getMin() {
106127
return min;
107128
}
108129

130+
/**
131+
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
132+
*/
133+
@Deprecated
109134
public void setMinMax(Binary min, Binary max) {
110135
this.max = max;
111136
this.min = min;

parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public byte[] getMinBytes() {
6161
return BytesUtils.booleanToBytes(min);
6262
}
6363

64+
@Override
65+
public boolean isSmallerThan(long size) {
66+
return !hasNonNullValue() || (2 < size);
67+
}
68+
6469
@Override
6570
public String toString() {
6671
if (this.hasNonNullValue())

parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public byte[] getMinBytes() {
6161
return BytesUtils.longToBytes(Double.doubleToLongBits(min));
6262
}
6363

64+
@Override
65+
public boolean isSmallerThan(long size) {
66+
return !hasNonNullValue() || (16 < size);
67+
}
68+
6469
@Override
6570
public String toString() {
6671
if(this.hasNonNullValue())

parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public byte[] getMinBytes() {
6161
return BytesUtils.intToBytes(Float.floatToIntBits(min));
6262
}
6363

64+
@Override
65+
public boolean isSmallerThan(long size) {
66+
return !hasNonNullValue() || (8 < size);
67+
}
68+
6469
@Override
6570
public String toString() {
6671
if (this.hasNonNullValue())

parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public byte[] getMinBytes() {
6161
return BytesUtils.intToBytes(min);
6262
}
6363

64+
@Override
65+
public boolean isSmallerThan(long size) {
66+
return !hasNonNullValue() || (8 < size);
67+
}
68+
6469
@Override
6570
public String toString() {
6671
if (this.hasNonNullValue())

parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ public byte[] getMinBytes() {
6161
return BytesUtils.longToBytes(min);
6262
}
6363

64+
@Override
65+
public boolean isSmallerThan(long size) {
66+
return !hasNonNullValue() || (16 < size);
67+
}
68+
6469
@Override
6570
public String toString() {
6671
if (this.hasNonNullValue())

parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,14 @@ public void mergeStatistics(Statistics stats) {
190190
*/
191191
abstract public byte[] getMinBytes();
192192

193+
/**
194+
* Abstract method to return whether the min and max values fit in the given
195+
* size.
196+
* @param size a size in bytes
197+
* @return true iff the combined size of the min and max values is less than size bytes
198+
*/
199+
abstract public boolean isSmallerThan(long size);
200+
193201
/**
194202
* toString() to display min, max, num_nulls in a string
195203
*/

parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ public class ParquetMetadataConverter {
7575

7676
public static final MetadataFilter NO_FILTER = new NoFilter();
7777
public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter();
78+
public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k
7879

7980
private static final Log LOG = Log.getLog(ParquetMetadataConverter.class);
8081

@@ -235,7 +236,7 @@ public Encoding getEncoding(org.apache.parquet.column.Encoding encoding) {
235236
public static Statistics toParquetStatistics(
236237
org.apache.parquet.column.statistics.Statistics statistics) {
237238
Statistics stats = new Statistics();
238-
if (!statistics.isEmpty()) {
239+
if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) {
239240
stats.setNull_count(statistics.getNumNulls());
240241
if (statistics.hasNonNullValue()) {
241242
stats.setMax(statistics.getMaxBytes());
@@ -244,6 +245,7 @@ public static Statistics toParquetStatistics(
244245
}
245246
return stats;
246247
}
248+
247249
/**
248250
* @deprecated Replaced by {@link #fromParquetStatistics(
249251
* String createdBy, Statistics statistics, PrimitiveTypeName type)}

parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java

Lines changed: 158 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,21 @@
4343
import java.util.Set;
4444
import java.util.TreeSet;
4545

46+
import org.apache.parquet.Version;
47+
import org.apache.parquet.bytes.BytesUtils;
4648
import org.apache.parquet.column.statistics.BinaryStatistics;
49+
import org.apache.parquet.column.statistics.BooleanStatistics;
50+
import org.apache.parquet.column.statistics.DoubleStatistics;
51+
import org.apache.parquet.column.statistics.FloatStatistics;
52+
import org.apache.parquet.column.statistics.IntStatistics;
53+
import org.apache.parquet.column.statistics.LongStatistics;
54+
import org.apache.parquet.column.statistics.Statistics;
4755
import org.apache.parquet.hadoop.metadata.BlockMetaData;
4856
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
4957
import org.apache.parquet.hadoop.metadata.ColumnPath;
5058
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
5159
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
60+
import org.apache.parquet.io.api.Binary;
5261
import org.junit.Assert;
5362
import org.junit.Test;
5463

@@ -320,5 +329,153 @@ public void testEncodingsCache() {
320329
assertEquals("java.util.Collections$UnmodifiableSet", res1.getClass().getName());
321330
assertEquals("java.util.Collections$UnmodifiableSet", res2.getClass().getName());
322331
assertEquals("java.util.Collections$UnmodifiableSet", res3.getClass().getName());
323-
}
332+
}
333+
334+
@Test
335+
public void testBinaryStats() {
336+
// make fake stats and verify the size check
337+
BinaryStatistics stats = new BinaryStatistics();
338+
stats.incrementNumNulls(3004);
339+
byte[] min = new byte[904];
340+
byte[] max = new byte[2388];
341+
stats.updateStats(Binary.fromConstantByteArray(min));
342+
stats.updateStats(Binary.fromConstantByteArray(max));
343+
long totalLen = min.length + max.length;
344+
Assert.assertFalse("Should not be smaller than min + max size",
345+
stats.isSmallerThan(totalLen));
346+
Assert.assertTrue("Should be smaller than min + max size + 1",
347+
stats.isSmallerThan(totalLen + 1));
348+
349+
org.apache.parquet.format.Statistics formatStats =
350+
ParquetMetadataConverter.toParquetStatistics(stats);
351+
352+
Assert.assertArrayEquals("Min should match", min, formatStats.getMin());
353+
Assert.assertArrayEquals("Max should match", max, formatStats.getMax());
354+
Assert.assertEquals("Num nulls should match",
355+
3004, formatStats.getNull_count());
356+
357+
// convert to empty stats because the values are too large
358+
stats.setMinMaxFromBytes(max, max);
359+
360+
formatStats = ParquetMetadataConverter.toParquetStatistics(stats);
361+
362+
Assert.assertFalse("Min should not be set", formatStats.isSetMin());
363+
Assert.assertFalse("Max should not be set", formatStats.isSetMax());
364+
Assert.assertFalse("Num nulls should not be set",
365+
formatStats.isSetNull_count());
366+
367+
Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatistics(
368+
Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY);
369+
370+
Assert.assertTrue(roundTripStats.isEmpty());
371+
}
372+
373+
@Test
374+
public void testIntegerStats() {
375+
// make fake stats and verify that int min/max pass the size check and are written
376+
IntStatistics stats = new IntStatistics();
377+
stats.incrementNumNulls(3004);
378+
int min = Integer.MIN_VALUE;
379+
int max = Integer.MAX_VALUE;
380+
stats.updateStats(min);
381+
stats.updateStats(max);
382+
383+
org.apache.parquet.format.Statistics formatStats =
384+
ParquetMetadataConverter.toParquetStatistics(stats);
385+
386+
Assert.assertEquals("Min should match",
387+
min, BytesUtils.bytesToInt(formatStats.getMin()));
388+
Assert.assertEquals("Max should match",
389+
max, BytesUtils.bytesToInt(formatStats.getMax()));
390+
Assert.assertEquals("Num nulls should match",
391+
3004, formatStats.getNull_count());
392+
}
393+
394+
@Test
395+
public void testLongStats() {
396+
// make fake stats and verify that long min/max pass the size check and are written
397+
LongStatistics stats = new LongStatistics();
398+
stats.incrementNumNulls(3004);
399+
long min = Long.MIN_VALUE;
400+
long max = Long.MAX_VALUE;
401+
stats.updateStats(min);
402+
stats.updateStats(max);
403+
404+
org.apache.parquet.format.Statistics formatStats =
405+
ParquetMetadataConverter.toParquetStatistics(stats);
406+
407+
Assert.assertEquals("Min should match",
408+
min, BytesUtils.bytesToLong(formatStats.getMin()));
409+
Assert.assertEquals("Max should match",
410+
max, BytesUtils.bytesToLong(formatStats.getMax()));
411+
Assert.assertEquals("Num nulls should match",
412+
3004, formatStats.getNull_count());
413+
}
414+
415+
@Test
416+
public void testFloatStats() {
417+
// make fake stats and verify that float min/max pass the size check and are written
418+
FloatStatistics stats = new FloatStatistics();
419+
stats.incrementNumNulls(3004);
420+
float min = Float.MIN_VALUE;
421+
float max = Float.MAX_VALUE;
422+
stats.updateStats(min);
423+
stats.updateStats(max);
424+
425+
org.apache.parquet.format.Statistics formatStats =
426+
ParquetMetadataConverter.toParquetStatistics(stats);
427+
428+
Assert.assertEquals("Min should match",
429+
min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())),
430+
0.000001);
431+
Assert.assertEquals("Max should match",
432+
max, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMax())),
433+
0.000001);
434+
Assert.assertEquals("Num nulls should match",
435+
3004, formatStats.getNull_count());
436+
}
437+
438+
@Test
439+
public void testDoubleStats() {
440+
// make fake stats and verify that double min/max pass the size check and are written
441+
DoubleStatistics stats = new DoubleStatistics();
442+
stats.incrementNumNulls(3004);
443+
double min = Double.MIN_VALUE;
444+
double max = Double.MAX_VALUE;
445+
stats.updateStats(min);
446+
stats.updateStats(max);
447+
448+
org.apache.parquet.format.Statistics formatStats =
449+
ParquetMetadataConverter.toParquetStatistics(stats);
450+
451+
Assert.assertEquals("Min should match",
452+
min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())),
453+
0.000001);
454+
Assert.assertEquals("Max should match",
455+
max, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMax())),
456+
0.000001);
457+
Assert.assertEquals("Num nulls should match",
458+
3004, formatStats.getNull_count());
459+
}
460+
461+
@Test
462+
public void testBooleanStats() {
463+
// make fake stats and verify that boolean min/max pass the size check and are written
464+
BooleanStatistics stats = new BooleanStatistics();
465+
stats.incrementNumNulls(3004);
466+
boolean min = Boolean.FALSE;
467+
boolean max = Boolean.TRUE;
468+
stats.updateStats(min);
469+
stats.updateStats(max);
470+
471+
org.apache.parquet.format.Statistics formatStats =
472+
ParquetMetadataConverter.toParquetStatistics(stats);
473+
474+
Assert.assertEquals("Min should match",
475+
min, BytesUtils.bytesToBool(formatStats.getMin()));
476+
Assert.assertEquals("Max should match",
477+
max, BytesUtils.bytesToBool(formatStats.getMax()));
478+
Assert.assertEquals("Num nulls should match",
479+
3004, formatStats.getNull_count());
480+
}
324481
}

parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,15 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc
282282
PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
283283
Statistics stats = getStatisticsFromPageHeader(page);
284284

285+
if (stats.isEmpty()) {
286+
// stats are empty if num nulls = 0 and there are no non-null values
287+
// this happens if stats are not written (e.g., when stats are too big)
288+
System.err.println(String.format(
289+
"No stats written for page=%s col=%s",
290+
page, Arrays.toString(desc.getPath())));
291+
return;
292+
}
293+
285294
long numNulls = 0;
286295
ColumnReaderImpl column = new ColumnReaderImpl(desc, reader, converter, null);
287296
for (int i = 0; i < reader.getTotalValueCount(); i += 1) {

0 commit comments

Comments
 (0)