diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index e8439f0027..c319b4adb0 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -67,6 +67,11 @@ public byte[] getMinBytes() { return min == null ? null : min.getBytes(); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || ((min.length() + max.length()) < size); + } + @Override public String toString() { if (this.hasNonNullValue()) @@ -77,11 +82,19 @@ else if (!this.isEmpty()) return "no stats for this column"; } + /** + * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + */ + @Deprecated public void updateStats(Binary min_value, Binary max_value) { if (min.compareTo(min_value) > 0) { min = min_value.copy(); } if (max.compareTo(max_value) < 0) { max = max_value.copy(); } } + /** + * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + */ + @Deprecated public void initializeStats(Binary min_value, Binary max_value) { min = min_value.copy(); max = max_value.copy(); @@ -98,14 +111,26 @@ public Binary genericGetMax() { return max; } + /** + * @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0 + */ + @Deprecated public Binary getMax() { return max; } + /** + * @deprecated use {@link #genericGetMin()}, will be removed in 2.0.0 + */ + @Deprecated public Binary getMin() { return min; } + /** + * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + */ + @Deprecated public void setMinMax(Binary min, Binary max) { this.max = max; this.min = min; diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index 1d02c74f83..22c23933bd 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -61,6 +61,11 @@ public byte[] getMinBytes() { return BytesUtils.booleanToBytes(min); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (2 < size); + } + @Override public String toString() { if (this.hasNonNullValue()) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index 9d94439e64..d67a550a6f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -61,6 +61,11 @@ public byte[] getMinBytes() { return BytesUtils.longToBytes(Double.doubleToLongBits(min)); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (16 < size); + } + @Override public String toString() { if(this.hasNonNullValue()) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index c164cf5ad3..dffc2077ed 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -61,6 +61,11 @@ public byte[] getMinBytes() { return BytesUtils.intToBytes(Float.floatToIntBits(min)); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (8 < size); + } + @Override public String toString() { if (this.hasNonNullValue()) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index 8deb28af12..a5d7ba196e 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -61,6 +61,11 @@ public byte[] getMinBytes() { return BytesUtils.intToBytes(min); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (8 < size); + } + @Override public String toString() { if (this.hasNonNullValue()) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index a8c177ee52..f7971efdd8 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -61,6 +61,11 @@ public byte[] getMinBytes() { return BytesUtils.longToBytes(min); } + @Override + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (16 < size); + } + @Override public String toString() { if (this.hasNonNullValue()) diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 5424414cc8..30153c0743 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -190,6 +190,14 @@ public void mergeStatistics(Statistics stats) { */ abstract public byte[] getMinBytes(); + /** + * Abstract method to return whether the min and max values fit in the given + * size. + * @param size a size in bytes + * @return true iff the min and max values are less than size bytes + */ + abstract public boolean isSmallerThan(long size); + /** * toString() to display min, max, num_nulls in a string */ diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 75b07fd85a..9eb471f26c 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -77,6 +77,7 @@ public class ParquetMetadataConverter { public static final MetadataFilter NO_FILTER = new NoFilter(); public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter(); + public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k private static final Log LOG = Log.getLog(ParquetMetadataConverter.class); @@ -284,7 +285,11 @@ dataPageType, getEncoding(encoding), public static Statistics toParquetStatistics( org.apache.parquet.column.statistics.Statistics statistics) { Statistics stats = new Statistics(); - if (!statistics.isEmpty()) { + // Don't write stats larger than the max size rather than truncating. The + // rationale is that some engines may use the minimum value in the page as + // the true minimum for aggregations and there is no way to mark that a + // value has been truncated and is a lower bound and not in the page. + if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) { stats.setNull_count(statistics.getNumNulls()); if (statistics.hasNonNullValue()) { stats.setMax(statistics.getMaxBytes()); @@ -293,6 +298,7 @@ public static Statistics toParquetStatistics( } return stats; } + /** * @deprecated Replaced by {@link #fromParquetStatistics( * String createdBy, Statistics statistics, PrimitiveTypeName type)} diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index b9cfde7757..3c888c37d9 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -45,12 +45,21 @@ import java.util.TreeSet; import com.google.common.collect.Sets; +import org.apache.parquet.Version; +import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.statistics.BinaryStatistics; +import org.apache.parquet.column.statistics.BooleanStatistics; +import org.apache.parquet.column.statistics.DoubleStatistics; +import org.apache.parquet.column.statistics.FloatStatistics; +import org.apache.parquet.column.statistics.IntStatistics; +import org.apache.parquet.column.statistics.LongStatistics; +import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.io.api.Binary; import org.junit.Assert; import org.junit.Test; @@ -357,5 +366,153 @@ public void testEncodingsCache() { assertEquals("java.util.Collections$UnmodifiableSet", res1.getClass().getName()); assertEquals("java.util.Collections$UnmodifiableSet", res2.getClass().getName()); assertEquals("java.util.Collections$UnmodifiableSet", res3.getClass().getName()); - } + } + + @Test + public void testBinaryStats() { + // make fake stats and verify the size check + BinaryStatistics stats = new BinaryStatistics(); + stats.incrementNumNulls(3004); + byte[] min = new byte[904]; + byte[] max = new byte[2388]; + stats.updateStats(Binary.fromConstantByteArray(min)); + stats.updateStats(Binary.fromConstantByteArray(max)); + long totalLen = min.length + max.length; + Assert.assertFalse("Should not be smaller than min + max size", + stats.isSmallerThan(totalLen)); + Assert.assertTrue("Should be smaller than min + max size + 1", + stats.isSmallerThan(totalLen + 1)); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertArrayEquals("Min should match", min, formatStats.getMin()); + Assert.assertArrayEquals("Max should match", max, formatStats.getMax()); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + + // convert to empty stats because the values are too large + stats.setMinMaxFromBytes(max, max); + + formatStats = ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertFalse("Min should not be set", formatStats.isSetMin()); + Assert.assertFalse("Max should not be set", formatStats.isSetMax()); + Assert.assertFalse("Num nulls should not be set", + formatStats.isSetNull_count()); + + Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatistics( + Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY); + + Assert.assertTrue(roundTripStats.isEmpty()); + } + + @Test + public void testIntegerStats() { + // make fake stats and verify the size check + IntStatistics stats = new IntStatistics(); + stats.incrementNumNulls(3004); + int min = Integer.MIN_VALUE; + int max = Integer.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, BytesUtils.bytesToInt(formatStats.getMin())); + Assert.assertEquals("Max should match", + max, BytesUtils.bytesToInt(formatStats.getMax())); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testLongStats() { + // make fake stats and verify the size check + LongStatistics stats = new LongStatistics(); + stats.incrementNumNulls(3004); + long min = Long.MIN_VALUE; + long max = Long.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, BytesUtils.bytesToLong(formatStats.getMin())); + Assert.assertEquals("Max should match", + max, BytesUtils.bytesToLong(formatStats.getMax())); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testFloatStats() { + // make fake stats and verify the size check + FloatStatistics stats = new FloatStatistics(); + stats.incrementNumNulls(3004); + float min = Float.MIN_VALUE; + float max = Float.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())), + 0.000001); + Assert.assertEquals("Max should match", + max, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMax())), + 0.000001); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testDoubleStats() { + // make fake stats and verify the size check + DoubleStatistics stats = new DoubleStatistics(); + stats.incrementNumNulls(3004); + double min = Double.MIN_VALUE; + double max = Double.MAX_VALUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())), + 0.000001); + Assert.assertEquals("Max should match", + max, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMax())), + 0.000001); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } + + @Test + public void testBooleanStats() { + // make fake stats and verify the size check + BooleanStatistics stats = new BooleanStatistics(); + stats.incrementNumNulls(3004); + boolean min = Boolean.FALSE; + boolean max = Boolean.TRUE; + stats.updateStats(min); + stats.updateStats(max); + + org.apache.parquet.format.Statistics formatStats = + ParquetMetadataConverter.toParquetStatistics(stats); + + Assert.assertEquals("Min should match", + min, BytesUtils.bytesToBool(formatStats.getMin())); + Assert.assertEquals("Max should match", + max, BytesUtils.bytesToBool(formatStats.getMax())); + Assert.assertEquals("Num nulls should match", + 3004, formatStats.getNull_count()); + } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java index 5bc060d2fa..d157cc3719 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java @@ -282,6 +282,15 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc PrimitiveConverter converter = getValidatingConverter(page, desc.getType()); Statistics stats = getStatisticsFromPageHeader(page); + if (stats.isEmpty()) { + // stats are empty if num nulls = 0 and there are no non-null values + // this happens if stats are not written (e.g., when stats are too big) + System.err.println(String.format( + "No stats written for page=%s col=%s", + page, Arrays.toString(desc.getPath()))); + return; + } + long numNulls = 0; ColumnReaderImpl column = new ColumnReaderImpl(desc, reader, converter, null); for (int i = 0; i < reader.getTotalValueCount(); i += 1) {