Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,72 @@ public Statistics<?> build() {
}
}

// Builder for FLOAT type to handle special cases of min/max values like NaN, -0.0, and 0.0
private static class FloatBuilder extends Builder {
public FloatBuilder(PrimitiveType type) {
super(type);
assert type.getPrimitiveTypeName() == PrimitiveTypeName.FLOAT;
}

@Override
public Statistics<?> build() {
FloatStatistics stats = (FloatStatistics) super.build();
if (stats.hasNonNullValue()) {
Float min = stats.genericGetMin();
Float max = stats.genericGetMax();
// Drop min/max values in case of NaN as the sorting order of values is undefined for this case
if (min.isNaN() || max.isNaN()) {
stats.setMinMax(0.0f, 0.0f);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems unnecessary. On the other hand, this is the default value when it's unset, so I'm fine with leaving this.

((Statistics<?>) stats).hasNonNullValue = false;
} else {
// Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
if (Float.compare(min, 0.0f) == 0) {
min = -0.0f;
stats.setMinMax(min, max);
}
if (Float.compare(max, -0.0f) == 0) {
max = 0.0f;
stats.setMinMax(min, max);
}
}
}
return stats;
}
}

// Builder for DOUBLE type to handle special cases of min/max values like NaN, -0.0, and 0.0
private static class DoubleBuilder extends Builder {
public DoubleBuilder(PrimitiveType type) {
super(type);
assert type.getPrimitiveTypeName() == PrimitiveTypeName.DOUBLE;
}

@Override
public Statistics<?> build() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comments as for the float case.

DoubleStatistics stats = (DoubleStatistics) super.build();
if (stats.hasNonNullValue()) {
Double min = stats.genericGetMin();
Double max = stats.genericGetMax();
// Drop min/max values in case of NaN as the sorting order of values is undefined for this case
if (min.isNaN() || max.isNaN()) {
stats.setMinMax(0.0, 0.0);
((Statistics<?>) stats).hasNonNullValue = false;
} else {
// Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
if (Double.compare(min, 0.0) == 0) {
min = -0.0;
stats.setMinMax(min, max);
}
if (Double.compare(max, -0.0) == 0) {
max = 0.0;
stats.setMinMax(min, max);
}
}
}
return stats;
}
}

private final PrimitiveType type;
private final PrimitiveComparator<T> comparator;
private boolean hasNonNullValue;
Expand Down Expand Up @@ -154,8 +220,15 @@ public static Statistics<?> createStats(Type type) {
* type of the column
* @return builder to create new statistics object
*/
public static Builder getBuilder(PrimitiveType type) {
return new Builder(type);
public static Builder getBuilderForReading(PrimitiveType type) {
switch (type.getPrimitiveTypeName()) {
case FLOAT:
return new FloatBuilder(type);
case DOUBLE:
return new DoubleBuilder(type);
default:
return new Builder(type);
}
}

/**
Expand Down Expand Up @@ -266,7 +339,7 @@ public void mergeStatistics(Statistics stats) {
* Abstract method to set min and max values from byte arrays.
* @param minBytes byte array to set the min value to
* @param maxBytes byte array to set the max value to
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
* @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)} instead.
*/
@Deprecated
abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);
Expand Down Expand Up @@ -401,7 +474,7 @@ public long getNumNulls() {
*
* @param nulls
* null count to set the count to
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
* @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)} instead.
*/
@Deprecated
public void setNumNulls(long nulls) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,24 @@
*/
package org.apache.parquet.column.statistics;

import static java.lang.Double.doubleToLongBits;
import static java.lang.Float.floatToIntBits;
import static org.apache.parquet.bytes.BytesUtils.intToBytes;
import static org.apache.parquet.bytes.BytesUtils.longToBytes;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
import static org.junit.Assert.*;

import java.nio.ByteBuffer;
import java.util.Locale;

import org.junit.Test;

import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
Expand Down Expand Up @@ -637,4 +648,142 @@ private void testMergingStringStats() {
assertEquals(stats.getMax(), Binary.fromString("zzzz"));
assertEquals(stats.getMin(), Binary.fromString(""));
}

@Test
public void testBuilder() {
testBuilder(Types.required(BOOLEAN).named("test_boolean"), false, new byte[] { 0 }, true, new byte[] { 1 });
testBuilder(Types.required(INT32).named("test_int32"), -42, intToBytes(-42), 42, intToBytes(42));
testBuilder(Types.required(INT64).named("test_int64"), -42l, longToBytes(-42), 42l, longToBytes(42));
testBuilder(Types.required(FLOAT).named("test_float"), -42.0f, intToBytes(floatToIntBits(-42.0f)), 42.0f,
intToBytes(floatToIntBits(42.0f)));
testBuilder(Types.required(DOUBLE).named("test_double"), -42.0, longToBytes(doubleToLongBits(-42.0)), 42.0,
longToBytes(Double.doubleToLongBits(42.0f)));

byte[] min = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
byte[] max = { 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 };
testBuilder(Types.required(INT96).named("test_int96"), Binary.fromConstantByteArray(min), min,
Binary.fromConstantByteArray(max), max);
testBuilder(Types.required(FIXED_LEN_BYTE_ARRAY).length(12).named("test_fixed"), Binary.fromConstantByteArray(min),
min,
Binary.fromConstantByteArray(max), max);
testBuilder(Types.required(BINARY).named("test_binary"), Binary.fromConstantByteArray(min), min,
Binary.fromConstantByteArray(max), max);
}

private void testBuilder(PrimitiveType type, Object min, byte[] minBytes, Object max, byte[] maxBytes) {
Statistics.Builder builder = Statistics.getBuilderForReading(type);
Statistics<?> stats = builder.build();
assertTrue(stats.isEmpty());
assertFalse(stats.isNumNullsSet());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withNumNulls(0).withMin(minBytes).build();
assertFalse(stats.isEmpty());
assertTrue(stats.isNumNullsSet());
assertFalse(stats.hasNonNullValue());
assertEquals(0, stats.getNumNulls());

builder = Statistics.getBuilderForReading(type);
stats = builder.withNumNulls(11).withMax(maxBytes).build();
assertFalse(stats.isEmpty());
assertTrue(stats.isNumNullsSet());
assertFalse(stats.hasNonNullValue());
assertEquals(11, stats.getNumNulls());

builder = Statistics.getBuilderForReading(type);
stats = builder.withNumNulls(42).withMin(minBytes).withMax(maxBytes).build();
assertFalse(stats.isEmpty());
assertTrue(stats.isNumNullsSet());
assertTrue(stats.hasNonNullValue());
assertEquals(42, stats.getNumNulls());
assertEquals(min, stats.genericGetMin());
assertEquals(max, stats.genericGetMax());
}

@Test
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add test case for -0 and +0.

public void testSpecBuilderForFloat() {
PrimitiveType type = Types.required(FLOAT).named("test_float");
Statistics.Builder builder = Statistics.getBuilderForReading(type);
Statistics<?> stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN)))
.withMax(intToBytes(floatToIntBits(42.0f))).withNumNulls(0).build();
assertTrue(stats.isNumNullsSet());
assertEquals(0, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(intToBytes(floatToIntBits(-42.0f)))
.withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(11).build();
assertTrue(stats.isNumNullsSet());
assertEquals(11, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN)))
.withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(42).build();
assertTrue(stats.isNumNullsSet());
assertEquals(42, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(intToBytes(floatToIntBits(0.0f)))
.withMax(intToBytes(floatToIntBits(42.0f))).build();
assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin()));
assertEquals(0, Float.compare(42.0f, (Float) stats.genericGetMax()));

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(intToBytes(floatToIntBits(-42.0f)))
.withMax(intToBytes(floatToIntBits(-0.0f))).build();
assertEquals(0, Float.compare(-42.0f, (Float) stats.genericGetMin()));
assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax()));

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(intToBytes(floatToIntBits(0.0f)))
.withMax(intToBytes(floatToIntBits(-0.0f))).build();
assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin()));
assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax()));
}

@Test
public void testSpecBuilderForDouble() {
PrimitiveType type = Types.required(DOUBLE).named("test_double");
Statistics.Builder builder = Statistics.getBuilderForReading(type);
Statistics<?> stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN)))
.withMax(longToBytes(doubleToLongBits(42.0))).withNumNulls(0).build();
assertTrue(stats.isNumNullsSet());
assertEquals(0, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(longToBytes(doubleToLongBits(-42.0)))
.withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(11).build();
assertTrue(stats.isNumNullsSet());
assertEquals(11, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN)))
.withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(42).build();
assertTrue(stats.isNumNullsSet());
assertEquals(42, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(longToBytes(doubleToLongBits(0.0)))
.withMax(longToBytes(doubleToLongBits(42.0))).build();
assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
assertEquals(0, Double.compare(42.0, (Double) stats.genericGetMax()));

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(longToBytes(doubleToLongBits(-42.0)))
.withMax(longToBytes(doubleToLongBits(-0.0))).build();
assertEquals(0, Double.compare(-42.0, (Double) stats.genericGetMin()));
assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(longToBytes(doubleToLongBits(0.0)))
.withMax(longToBytes(doubleToLongBits(-0.0))).build();
assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
(String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) {
// create stats object based on the column type
org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
org.apache.parquet.column.statistics.Statistics.getBuilder(type);
org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type);

if (formatStats != null) {
// Use the new V2 min-max statistics over the former one if it is filled
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column
private static final IntStatistics intStats = new IntStatistics();
private static final IntStatistics nullIntStats = new IntStatistics();
private static final org.apache.parquet.column.statistics.Statistics<?> emptyIntStats = org.apache.parquet.column.statistics.Statistics
.getBuilder(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build();
.getBuilderForReading(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build();
private static final DoubleStatistics doubleStats = new DoubleStatistics();
private static final org.apache.parquet.column.statistics.Statistics<?> missingMinMaxDoubleStats = org.apache.parquet.column.statistics.Statistics
.getBuilder(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build();
.getBuilderForReading(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build();

static {
intStats.setMinMax(10, 100);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ public void test() throws Exception {
int v = 3;
BytesInput definitionLevels = BytesInput.fromInt(d);
BytesInput repetitionLevels = BytesInput.fromInt(r);
Statistics<?> statistics = Statistics.getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
.build();
BytesInput data = BytesInput.fromInt(v);
int rowCount = 5;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ public class TestParquetFileWriter {
private static final CompressionCodecName CODEC = CompressionCodecName.UNCOMPRESSED;

private static final org.apache.parquet.column.statistics.Statistics<?> EMPTY_STATS = org.apache.parquet.column.statistics.Statistics
.getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();

private String writeSchema;

Expand Down