diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 26c14c1357..ab07a0fecd 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -69,6 +69,72 @@ public Statistics build() { } } + // Builder for FLOAT type to handle special cases of min/max values like NaN, -0.0, and 0.0 + private static class FloatBuilder extends Builder { + public FloatBuilder(PrimitiveTypeName type) { + super(type); + assert type == PrimitiveTypeName.FLOAT; + } + + @Override + public Statistics build() { + FloatStatistics stats = (FloatStatistics) super.build(); + if (stats.hasNonNullValue()) { + Float min = stats.genericGetMin(); + Float max = stats.genericGetMax(); + // Drop min/max values in case of NaN as the sorting order of values is undefined for this case + if (min.isNaN() || max.isNaN()) { + stats.setMinMax(0.0f, 0.0f); + ((Statistics) stats).hasNonNullValue = false; + } else { + // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped + if (Float.compare(min, 0.0f) == 0) { + min = -0.0f; + stats.setMinMax(min, max); + } + if (Float.compare(max, -0.0f) == 0) { + max = 0.0f; + stats.setMinMax(min, max); + } + } + } + return stats; + } + } + + // Builder for DOUBLE type to handle special cases of min/max values like NaN, -0.0, and 0.0 + private static class DoubleBuilder extends Builder { + public DoubleBuilder(PrimitiveTypeName type) { + super(type); + assert type == PrimitiveTypeName.DOUBLE; + } + + @Override + public Statistics build() { + DoubleStatistics stats = (DoubleStatistics) super.build(); + if (stats.hasNonNullValue()) { + Double min = stats.genericGetMin(); + Double max = stats.genericGetMax(); + // Drop min/max values in case of NaN as the sorting order of values is undefined for this case + if (min.isNaN() || max.isNaN()) { + stats.setMinMax(0.0, 0.0); + ((Statistics) stats).hasNonNullValue = false; + } else { + // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped + if (Double.compare(min, 0.0) == 0) { + min = -0.0; + stats.setMinMax(min, max); + } + if (Double.compare(max, -0.0) == 0) { + max = 0.0; + stats.setMinMax(min, max); + } + } + } + return stats; + } + } + private boolean hasNonNullValue; private long num_nulls; @@ -112,8 +178,15 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { * type of the column * @return builder to create new statistics object */ - public static Builder getBuilder(PrimitiveTypeName type) { - return new Builder(type); + public static Builder getBuilderForReading(PrimitiveTypeName type) { + switch (type) { + case FLOAT: + return new FloatBuilder(type); + case DOUBLE: + return new DoubleBuilder(type); + default: + return new Builder(type); + } } /** @@ -221,7 +294,7 @@ public void mergeStatistics(Statistics stats) { * Abstract method to set min and max values from byte arrays. * @param minBytes byte array to set the min value to * @param maxBytes byte array to set the max value to - * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead. + * @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)} instead. */ @Deprecated abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes); @@ -283,7 +356,7 @@ public long getNumNulls() { * * @param nulls * null count to set the count to - * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead. + * @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)} instead. */ @Deprecated public void setNumNulls(long nulls) { diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java index cf4bf59af2..7d5278e204 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java @@ -18,13 +18,25 @@ */ package org.apache.parquet.column.statistics; +import static java.lang.Double.doubleToLongBits; +import static java.lang.Float.floatToIntBits; +import static org.apache.parquet.bytes.BytesUtils.intToBytes; +import static org.apache.parquet.bytes.BytesUtils.longToBytes; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; import static org.junit.Assert.*; import java.nio.ByteBuffer; import org.junit.Test; - import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; public class TestStatistics { private int[] integerArray; @@ -585,4 +597,142 @@ private void testMergingStringStats() { assertEquals(stats.getMax(), Binary.fromString("zzzz")); assertEquals(stats.getMin(), Binary.fromString("")); } + + @Test + public void testBuilder() { + testBuilder(BOOLEAN, false, new byte[] { 0 }, true, new byte[] { 1 }); + testBuilder(INT32, -42, intToBytes(-42), 42, intToBytes(42)); + testBuilder(INT64, -42l, longToBytes(-42), 42l, longToBytes(42)); + testBuilder(FLOAT, -42.0f, intToBytes(floatToIntBits(-42.0f)), 42.0f, + intToBytes(floatToIntBits(42.0f))); + testBuilder(DOUBLE, -42.0, longToBytes(doubleToLongBits(-42.0)), 42.0, + longToBytes(Double.doubleToLongBits(42.0f))); + + byte[] min = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }; + byte[] max = { 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; + testBuilder(INT96, Binary.fromConstantByteArray(min), min, + Binary.fromConstantByteArray(max), max); + testBuilder(FIXED_LEN_BYTE_ARRAY, Binary.fromConstantByteArray(min), + min, + Binary.fromConstantByteArray(max), max); + testBuilder(BINARY, Binary.fromConstantByteArray(min), min, + Binary.fromConstantByteArray(max), max); + } + + private void testBuilder(PrimitiveTypeName type, Object min, byte[] minBytes, Object max, byte[] maxBytes) { + Statistics.Builder builder = Statistics.getBuilderForReading(type); + Statistics stats = builder.build(); + assertTrue(stats.isEmpty()); + assertFalse(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withNumNulls(0).withMin(minBytes).build(); + assertFalse(stats.isEmpty()); + assertTrue(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + assertEquals(0, stats.getNumNulls()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withNumNulls(11).withMax(maxBytes).build(); + assertFalse(stats.isEmpty()); + assertTrue(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + assertEquals(11, stats.getNumNulls()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withNumNulls(42).withMin(minBytes).withMax(maxBytes).build(); + assertFalse(stats.isEmpty()); + assertTrue(stats.isNumNullsSet()); + assertTrue(stats.hasNonNullValue()); + assertEquals(42, stats.getNumNulls()); + assertEquals(min, stats.genericGetMin()); + assertEquals(max, stats.genericGetMax()); + } + + @Test + public void testSpecBuilderForFloat() { + PrimitiveTypeName type = FLOAT; + Statistics.Builder builder = Statistics.getBuilderForReading(type); + Statistics stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN))) + .withMax(intToBytes(floatToIntBits(42.0f))).withNumNulls(0).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(0, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(intToBytes(floatToIntBits(-42.0f))) + .withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(11).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(11, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN))) + .withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(42).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(42, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(intToBytes(floatToIntBits(0.0f))) + .withMax(intToBytes(floatToIntBits(42.0f))).build(); + assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin())); + assertEquals(0, Float.compare(42.0f, (Float) stats.genericGetMax())); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(intToBytes(floatToIntBits(-42.0f))) + .withMax(intToBytes(floatToIntBits(-0.0f))).build(); + assertEquals(0, Float.compare(-42.0f, (Float) stats.genericGetMin())); + assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax())); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(intToBytes(floatToIntBits(0.0f))) + .withMax(intToBytes(floatToIntBits(-0.0f))).build(); + assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin())); + assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax())); + } + + @Test + public void testSpecBuilderForDouble() { + PrimitiveTypeName type = DOUBLE; + Statistics.Builder builder = Statistics.getBuilderForReading(type); + Statistics stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN))) + .withMax(longToBytes(doubleToLongBits(42.0))).withNumNulls(0).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(0, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(longToBytes(doubleToLongBits(-42.0))) + .withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(11).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(11, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN))) + .withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(42).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(42, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(longToBytes(doubleToLongBits(0.0))) + .withMax(longToBytes(doubleToLongBits(42.0))).build(); + assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin())); + assertEquals(0, Double.compare(42.0, (Double) stats.genericGetMax())); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(longToBytes(doubleToLongBits(-42.0))) + .withMax(longToBytes(doubleToLongBits(-0.0))).build(); + assertEquals(0, Double.compare(-42.0, (Double) stats.genericGetMin())); + assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax())); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(longToBytes(doubleToLongBits(0.0))) + .withMax(longToBytes(doubleToLongBits(-0.0))).build(); + assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin())); + assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax())); + } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 9df5660460..9800dab318 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -338,7 +338,7 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist (String createdBy, Statistics statistics, PrimitiveTypeName type, SortOrder typeSortOrder) { // create stats object based on the column type org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = - org.apache.parquet.column.statistics.Statistics.getBuilder(type); + org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); // If there was no statistics written to the footer, create an empty Statistics object and return // NOTE: See docs in CorruptStatistics for explanation of why this check is needed diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java index a0551a452d..cfc429aabf 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java @@ -90,10 +90,10 @@ private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column private static final IntStatistics intStats = new IntStatistics(); private static final IntStatistics nullIntStats = new IntStatistics(); private static final org.apache.parquet.column.statistics.Statistics emptyIntStats = org.apache.parquet.column.statistics.Statistics - .getBuilder(PrimitiveTypeName.INT32).build(); + .getBuilderForReading(PrimitiveTypeName.INT32).build(); private static final DoubleStatistics doubleStats = new DoubleStatistics(); private static final org.apache.parquet.column.statistics.Statistics missingMinMaxDoubleStats = org.apache.parquet.column.statistics.Statistics - .getBuilder(PrimitiveTypeName.DOUBLE).withNumNulls(100).build(); + .getBuilderForReading(PrimitiveTypeName.DOUBLE).withNumNulls(100).build(); static { intStats.setMinMax(10, 100); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java index a83247f03b..80363848df 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java @@ -90,7 +90,7 @@ public void test() throws Exception { int v = 3; BytesInput definitionLevels = BytesInput.fromInt(d); BytesInput repetitionLevels = BytesInput.fromInt(r); - Statistics statistics = Statistics.getBuilder(PrimitiveTypeName.BINARY).build(); + Statistics statistics = Statistics.getBuilderForReading(PrimitiveTypeName.BINARY).build(); BytesInput data = BytesInput.fromInt(v); int rowCount = 5; int nullCount = 1; diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index 24307b6f37..0bbdf637ec 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -94,7 +94,7 @@ public class TestParquetFileWriter { private static final CompressionCodecName CODEC = CompressionCodecName.UNCOMPRESSED; private static final org.apache.parquet.column.statistics.Statistics EMPTY_STATS = org.apache.parquet.column.statistics.Statistics - .getBuilder(PrimitiveTypeName.BINARY).build(); + .getBuilderForReading(PrimitiveTypeName.BINARY).build(); private String writeSchema;