diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index a087c5f70e..6888ad61bc 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -73,6 +73,72 @@ public Statistics build() { } } + // Builder for FLOAT type to handle special cases of min/max values like NaN, -0.0, and 0.0 + private static class FloatBuilder extends Builder { + public FloatBuilder(PrimitiveType type) { + super(type); + assert type.getPrimitiveTypeName() == PrimitiveTypeName.FLOAT; + } + + @Override + public Statistics build() { + FloatStatistics stats = (FloatStatistics) super.build(); + if (stats.hasNonNullValue()) { + Float min = stats.genericGetMin(); + Float max = stats.genericGetMax(); + // Drop min/max values in case of NaN as the sorting order of values is undefined for this case + if (min.isNaN() || max.isNaN()) { + stats.setMinMax(0.0f, 0.0f); + ((Statistics) stats).hasNonNullValue = false; + } else { + // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped + if (Float.compare(min, 0.0f) == 0) { + min = -0.0f; + stats.setMinMax(min, max); + } + if (Float.compare(max, -0.0f) == 0) { + max = 0.0f; + stats.setMinMax(min, max); + } + } + } + return stats; + } + } + + // Builder for DOUBLE type to handle special cases of min/max values like NaN, -0.0, and 0.0 + private static class DoubleBuilder extends Builder { + public DoubleBuilder(PrimitiveType type) { + super(type); + assert type.getPrimitiveTypeName() == PrimitiveTypeName.DOUBLE; + } + + @Override + public Statistics build() { + DoubleStatistics stats = (DoubleStatistics) super.build(); + if (stats.hasNonNullValue()) { + Double min = stats.genericGetMin(); + Double max = stats.genericGetMax(); + // Drop min/max values in case of NaN as the sorting order of values is undefined for this case + if (min.isNaN() || max.isNaN()) { + stats.setMinMax(0.0, 0.0); + ((Statistics) stats).hasNonNullValue = false; + } else { + // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped + if (Double.compare(min, 0.0) == 0) { + min = -0.0; + stats.setMinMax(min, max); + } + if (Double.compare(max, -0.0) == 0) { + max = 0.0; + stats.setMinMax(min, max); + } + } + } + return stats; + } + } + private final PrimitiveType type; private final PrimitiveComparator comparator; private boolean hasNonNullValue; @@ -154,8 +220,15 @@ public static Statistics createStats(Type type) { * type of the column * @return builder to create new statistics object */ - public static Builder getBuilder(PrimitiveType type) { - return new Builder(type); + public static Builder getBuilderForReading(PrimitiveType type) { + switch (type.getPrimitiveTypeName()) { + case FLOAT: + return new FloatBuilder(type); + case DOUBLE: + return new DoubleBuilder(type); + default: + return new Builder(type); + } } /** @@ -266,7 +339,7 @@ public void mergeStatistics(Statistics stats) { * Abstract method to set min and max values from byte arrays. * @param minBytes byte array to set the min value to * @param maxBytes byte array to set the max value to - * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead. + * @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)} instead. */ @Deprecated abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes); @@ -401,7 +474,7 @@ public long getNumNulls() { * * @param nulls * null count to set the count to - * @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead. + * @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)} instead. */ @Deprecated public void setNumNulls(long nulls) { diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java index 5e5d5fd26e..4431ea4379 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java @@ -18,13 +18,24 @@ */ package org.apache.parquet.column.statistics; +import static java.lang.Double.doubleToLongBits; +import static java.lang.Float.floatToIntBits; +import static org.apache.parquet.bytes.BytesUtils.intToBytes; +import static org.apache.parquet.bytes.BytesUtils.longToBytes; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; import static org.junit.Assert.*; import java.nio.ByteBuffer; import java.util.Locale; import org.junit.Test; - import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; @@ -637,4 +648,142 @@ private void testMergingStringStats() { assertEquals(stats.getMax(), Binary.fromString("zzzz")); assertEquals(stats.getMin(), Binary.fromString("")); } + + @Test + public void testBuilder() { + testBuilder(Types.required(BOOLEAN).named("test_boolean"), false, new byte[] { 0 }, true, new byte[] { 1 }); + testBuilder(Types.required(INT32).named("test_int32"), -42, intToBytes(-42), 42, intToBytes(42)); + testBuilder(Types.required(INT64).named("test_int64"), -42l, longToBytes(-42), 42l, longToBytes(42)); + testBuilder(Types.required(FLOAT).named("test_float"), -42.0f, intToBytes(floatToIntBits(-42.0f)), 42.0f, + intToBytes(floatToIntBits(42.0f))); + testBuilder(Types.required(DOUBLE).named("test_double"), -42.0, longToBytes(doubleToLongBits(-42.0)), 42.0, + longToBytes(Double.doubleToLongBits(42.0f))); + + byte[] min = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }; + byte[] max = { 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 }; + testBuilder(Types.required(INT96).named("test_int96"), Binary.fromConstantByteArray(min), min, + Binary.fromConstantByteArray(max), max); + testBuilder(Types.required(FIXED_LEN_BYTE_ARRAY).length(12).named("test_fixed"), Binary.fromConstantByteArray(min), + min, + Binary.fromConstantByteArray(max), max); + testBuilder(Types.required(BINARY).named("test_binary"), Binary.fromConstantByteArray(min), min, + Binary.fromConstantByteArray(max), max); + } + + private void testBuilder(PrimitiveType type, Object min, byte[] minBytes, Object max, byte[] maxBytes) { + Statistics.Builder builder = Statistics.getBuilderForReading(type); + Statistics stats = builder.build(); + assertTrue(stats.isEmpty()); + assertFalse(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withNumNulls(0).withMin(minBytes).build(); + assertFalse(stats.isEmpty()); + assertTrue(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + assertEquals(0, stats.getNumNulls()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withNumNulls(11).withMax(maxBytes).build(); + assertFalse(stats.isEmpty()); + assertTrue(stats.isNumNullsSet()); + assertFalse(stats.hasNonNullValue()); + assertEquals(11, stats.getNumNulls()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withNumNulls(42).withMin(minBytes).withMax(maxBytes).build(); + assertFalse(stats.isEmpty()); + assertTrue(stats.isNumNullsSet()); + assertTrue(stats.hasNonNullValue()); + assertEquals(42, stats.getNumNulls()); + assertEquals(min, stats.genericGetMin()); + assertEquals(max, stats.genericGetMax()); + } + + @Test + public void testSpecBuilderForFloat() { + PrimitiveType type = Types.required(FLOAT).named("test_float"); + Statistics.Builder builder = Statistics.getBuilderForReading(type); + Statistics stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN))) + .withMax(intToBytes(floatToIntBits(42.0f))).withNumNulls(0).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(0, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(intToBytes(floatToIntBits(-42.0f))) + .withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(11).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(11, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN))) + .withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(42).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(42, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(intToBytes(floatToIntBits(0.0f))) + .withMax(intToBytes(floatToIntBits(42.0f))).build(); + assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin())); + assertEquals(0, Float.compare(42.0f, (Float) stats.genericGetMax())); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(intToBytes(floatToIntBits(-42.0f))) + .withMax(intToBytes(floatToIntBits(-0.0f))).build(); + assertEquals(0, Float.compare(-42.0f, (Float) stats.genericGetMin())); + assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax())); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(intToBytes(floatToIntBits(0.0f))) + .withMax(intToBytes(floatToIntBits(-0.0f))).build(); + assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin())); + assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax())); + } + + @Test + public void testSpecBuilderForDouble() { + PrimitiveType type = Types.required(DOUBLE).named("test_double"); + Statistics.Builder builder = Statistics.getBuilderForReading(type); + Statistics stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN))) + .withMax(longToBytes(doubleToLongBits(42.0))).withNumNulls(0).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(0, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(longToBytes(doubleToLongBits(-42.0))) + .withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(11).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(11, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN))) + .withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(42).build(); + assertTrue(stats.isNumNullsSet()); + assertEquals(42, stats.getNumNulls()); + assertFalse(stats.hasNonNullValue()); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(longToBytes(doubleToLongBits(0.0))) + .withMax(longToBytes(doubleToLongBits(42.0))).build(); + assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin())); + assertEquals(0, Double.compare(42.0, (Double) stats.genericGetMax())); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(longToBytes(doubleToLongBits(-42.0))) + .withMax(longToBytes(doubleToLongBits(-0.0))).build(); + assertEquals(0, Double.compare(-42.0, (Double) stats.genericGetMin())); + assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax())); + + builder = Statistics.getBuilderForReading(type); + stats = builder.withMin(longToBytes(doubleToLongBits(0.0))) + .withMax(longToBytes(doubleToLongBits(-0.0))).build(); + assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin())); + assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax())); + } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 0daabb6fe0..bc435168b9 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -402,7 +402,7 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist (String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) { // create stats object based on the column type org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = - org.apache.parquet.column.statistics.Statistics.getBuilder(type); + org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type); if (formatStats != null) { // Use the new V2 min-max statistics over the former one if it is filled diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java index 6fdec2a6cd..97dd1695ae 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java @@ -90,10 +90,10 @@ private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column private static final IntStatistics intStats = new IntStatistics(); private static final IntStatistics nullIntStats = new IntStatistics(); private static final org.apache.parquet.column.statistics.Statistics emptyIntStats = org.apache.parquet.column.statistics.Statistics - .getBuilder(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build(); + .getBuilderForReading(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build(); private static final DoubleStatistics doubleStats = new DoubleStatistics(); private static final org.apache.parquet.column.statistics.Statistics missingMinMaxDoubleStats = org.apache.parquet.column.statistics.Statistics - .getBuilder(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build(); + .getBuilderForReading(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build(); static { intStats.setMinMax(10, 100); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java index 0b7b9517a1..a5381f073b 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java @@ -93,7 +93,7 @@ public void test() throws Exception { int v = 3; BytesInput definitionLevels = BytesInput.fromInt(d); BytesInput repetitionLevels = BytesInput.fromInt(r); - Statistics statistics = Statistics.getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary")) + Statistics statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")) .build(); BytesInput data = BytesInput.fromInt(v); int rowCount = 5; diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java index c73e569271..095b575c80 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java @@ -96,7 +96,7 @@ public class TestParquetFileWriter { private static final CompressionCodecName CODEC = CompressionCodecName.UNCOMPRESSED; private static final org.apache.parquet.column.statistics.Statistics EMPTY_STATS = org.apache.parquet.column.statistics.Statistics - .getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build(); + .getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build(); private String writeSchema;