From fbbc1c440d042569ec28d20d3bcd270dd7f18a07 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 23 Sep 2015 09:45:02 -0700 Subject: [PATCH 1/2] PARQUET-372: Do not write stats larger than 4k. This updates the stats conversion to check whether the min and max values for page stats are larger than 4k. If so, no statistics for a page are written. --- .../column/statistics/BinaryStatistics.java | 25 +++ .../column/statistics/BooleanStatistics.java | 5 + .../column/statistics/DoubleStatistics.java | 5 + .../column/statistics/FloatStatistics.java | 5 + .../column/statistics/IntStatistics.java | 5 + .../column/statistics/LongStatistics.java | 5 + .../parquet/column/statistics/Statistics.java | 8 + .../converter/ParquetMetadataConverter.java | 4 +- .../TestParquetMetadataConverter.java | 159 +++++++++++++++++- .../parquet/statistics/TestStatistics.java | 9 + 10 files changed, 228 insertions(+), 2 deletions(-) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index e8439f0027..c319b4adb0 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -67,6 +67,11 @@ public byte[] getMinBytes() { return min == null ? null : min.getBytes(); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || ((min.length() + max.length()) < size); + } + @Override public String toString() { if (this.hasNonNullValue()) @@ -77,11 +82,19 @@ else if (!this.isEmpty()) return "no stats for this column"; } + /** + * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + */ + @Deprecated public void updateStats(Binary min_value, Binary max_value) { if (min.compareTo(min_value) > 0) { min = min_value.copy(); } if (max.compareTo(max_value) < 0) { max = max_value.copy(); } } + /** + * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + */ + @Deprecated public void initializeStats(Binary min_value, Binary max_value) { min = min_value.copy(); max = max_value.copy(); @@ -98,14 +111,26 @@ public Binary genericGetMax() { return max; } + /** + * @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0 + */ + @Deprecated public Binary getMax() { return max; } + /** + * @deprecated use {@link #genericGetMin()}, will be removed in 2.0.0 + */ + @Deprecated public Binary getMin() { return min; } + /** + * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + */ + @Deprecated public void setMinMax(Binary min, Binary max) { this.max = max; this.min = min; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index 1d02c74f83..22c23933bd 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -61,6 +61,11 @@ public byte[] getMinBytes() { return BytesUtils.booleanToBytes(min); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (2 < size); + } + @Override public String toString() { if (this.hasNonNullValue()) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index 9d94439e64..d67a550a6f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -61,6 +61,11 @@ public byte[] getMinBytes() { return BytesUtils.longToBytes(Double.doubleToLongBits(min)); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (16 < size); + } + @Override public String toString() { if(this.hasNonNullValue()) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index c164cf5ad3..dffc2077ed 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -61,6 +61,11 @@ public byte[] getMinBytes() { return BytesUtils.intToBytes(Float.floatToIntBits(min)); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (8 < size); + } + @Override public String toString() { if (this.hasNonNullValue()) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index 8deb28af12..a5d7ba196e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -61,6 +61,11 @@ public byte[] getMinBytes() { return BytesUtils.intToBytes(min); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (8 < size); + } + @Override public String toString() { if (this.hasNonNullValue()) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index a8c177ee52..f7971efdd8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -61,6 +61,11 @@ public byte[] getMinBytes() { return BytesUtils.longToBytes(min); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (16 < size); + } + @Override public String toString() { if (this.hasNonNullValue()) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 5424414cc8..30153c0743 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -190,6 +190,14 @@ public void mergeStatistics(Statistics stats) { */ abstract public byte[] getMinBytes(); + /** + * Abstract method to return whether the min and max values fit in the given + * size. + * @param size a size in bytes + * @return true iff the min and max values are less than size bytes + */ + abstract public boolean isSmallerThan(long size); + /** * toString() to display min, max, num_nulls in a string */ diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 75b07fd85a..78c11b988a 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -77,6 +77,7 @@ public class ParquetMetadataConverter { public static final MetadataFilter NO_FILTER = new NoFilter(); public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter(); + public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k private static final Log LOG = Log.getLog(ParquetMetadataConverter.class); @@ -284,7 +285,7 @@ dataPageType, getEncoding(encoding), public static Statistics toParquetStatistics( org.apache.parquet.column.statistics.Statistics statistics) { Statistics stats = new Statistics(); - if (!statistics.isEmpty()) { + if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) { stats.setNull_count(statistics.getNumNulls()); if (statistics.hasNonNullValue()) { stats.setMax(statistics.getMaxBytes()); @@ -293,6 +294,7 @@ public static Statistics toParquetStatistics( } return stats; } + /** * @deprecated Replaced by {@link #fromParquetStatistics( * String createdBy, Statistics statistics, PrimitiveTypeName type)} diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index b9cfde7757..3c888c37d9 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -45,12 +45,21 @@ import java.util.TreeSet; import com.google.common.collect.Sets; +import org.apache.parquet.Version; +import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.statistics.BinaryStatistics; +import org.apache.parquet.column.statistics.BooleanStatistics; +import org.apache.parquet.column.statistics.DoubleStatistics; +import org.apache.parquet.column.statistics.FloatStatistics; +import org.apache.parquet.column.statistics.IntStatistics; +import org.apache.parquet.column.statistics.LongStatistics; +import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.io.api.Binary; import org.junit.Assert; import org.junit.Test; @@ -357,5 +366,153 @@ public void testEncodingsCache() { assertEquals("java.util.Collections$UnmodifiableSet", res1.getClass().getName()); assertEquals("java.util.Collections$UnmodifiableSet", res2.getClass().getName()); assertEquals("java.util.Collections$UnmodifiableSet", res3.getClass().getName()); - } + } + + @Test + public void testBinaryStats() { + // make fake stats and verify the size check + BinaryStatistics stats = new BinaryStatistics(); + stats.incrementNumNulls(3004); + byte[] min = new byte[904]; + byte[] max = new byte[2388]; + stats.updateStats(Binary.fromConstantByteArray(min)); + stats.updateStats(Binary.fromConstantByteArray(max)); + long totalLen = min.length + max.length; + Assert.assertFalse("Should not be smaller than min + max size", + stats.isSmallerThan(totalLen)); + Assert.assertTrue("Should be smaller than min + max size + 1", + stats.isSmallerThan(totalLen + 1)); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertArrayEquals("Min should match", min, formatStats.getMin()); + Assert.assertArrayEquals("Max should match", max, formatStats.getMax()); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + + // convert to empty stats because the values are too large + stats.setMinMaxFromBytes(max, max); + + formatStats = ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertFalse("Min should not be set", formatStats.isSetMin()); + Assert.assertFalse("Max should not be set", formatStats.isSetMax()); + Assert.assertFalse("Num nulls should not be set", + formatStats.isSetNull_count()); + + Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatistics( + Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY); + + Assert.assertTrue(roundTripStats.isEmpty()); + } + + @Test + public void testIntegerStats() { + // make fake stats and verify the size check + IntStatistics stats = new IntStatistics(); + stats.incrementNumNulls(3004); + int min = Integer.MIN_VALUE; + int max = Integer.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, BytesUtils.bytesToInt(formatStats.getMin())); + Assert.assertEquals("Max should match", + max, BytesUtils.bytesToInt(formatStats.getMax())); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testLongStats() { + // make fake stats and verify the size check + LongStatistics stats = new LongStatistics(); + stats.incrementNumNulls(3004); + long min = Long.MIN_VALUE; + long max = Long.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, BytesUtils.bytesToLong(formatStats.getMin())); + Assert.assertEquals("Max should match", + max, BytesUtils.bytesToLong(formatStats.getMax())); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testFloatStats() { + // make fake stats and verify the size check + FloatStatistics stats = new FloatStatistics(); + stats.incrementNumNulls(3004); + float min = Float.MIN_VALUE; + float max = Float.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())), + 0.000001); + Assert.assertEquals("Max should match", + max, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMax())), + 0.000001); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testDoubleStats() { + // make fake stats and verify the size check + DoubleStatistics stats = new DoubleStatistics(); + stats.incrementNumNulls(3004); + double min = Double.MIN_VALUE; + double max = Double.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())), + 0.000001); + Assert.assertEquals("Max should match", + max, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMax())), + 0.000001); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testBooleanStats() { + // make fake stats and verify the size check + BooleanStatistics stats = new BooleanStatistics(); + stats.incrementNumNulls(3004); + boolean min = Boolean.FALSE; + boolean max = Boolean.TRUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, BytesUtils.bytesToBool(formatStats.getMin())); + Assert.assertEquals("Max should match", + max, BytesUtils.bytesToBool(formatStats.getMax())); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java index 5bc060d2fa..d157cc3719 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java @@ -282,6 +282,15 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc PrimitiveConverter converter = getValidatingConverter(page, desc.getType()); Statistics stats = getStatisticsFromPageHeader(page); + if (stats.isEmpty()) { + // stats are empty if num nulls = 0 and there are no non-null values + // this happens if stats are not written (e.g., when stats are too big) + System.err.println(String.format( + "No stats written for page=%s col=%s", + page, Arrays.toString(desc.getPath()))); + return; + } + long numNulls = 0; ColumnReaderImpl column = new ColumnReaderImpl(desc, reader, converter, null); for (int i = 0; i < reader.getTotalValueCount(); i += 1) { From 61e05d92b37089d1d1f70004251dbbc11e978c2d Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 18 Dec 2015 09:21:46 -0800 Subject: [PATCH 2/2] PARQUET-372: Add comment to explain not truncating values. --- .../parquet/format/converter/ParquetMetadataConverter.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 78c11b988a..9eb471f26c 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -285,6 +285,10 @@ dataPageType, getEncoding(encoding), public static Statistics toParquetStatistics( org.apache.parquet.column.statistics.Statistics statistics) { Statistics stats = new Statistics(); + // Don't write stats larger than the max size rather than truncating. The + // rationale is that some engines may use the minimum value in the page as + // the true minimum for aggregations and there is no way to mark that a + // value has been truncated and is a lower bound and not in the page. if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) { stats.setNull_count(statistics.getNumNulls()); if (statistics.hasNonNullValue()) {