Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,72 @@ public Statistics<?> build() {
}
}

// Builder for FLOAT type to handle special cases of min/max values like NaN, -0.0, and 0.0
private static class FloatBuilder extends Builder {
public FloatBuilder(PrimitiveTypeName type) {
super(type);
assert type == PrimitiveTypeName.FLOAT;
}

@Override
public Statistics<?> build() {
FloatStatistics stats = (FloatStatistics) super.build();
if (stats.hasNonNullValue()) {
Float min = stats.genericGetMin();
Float max = stats.genericGetMax();
// Drop min/max values in case of NaN as the sorting order of values is undefined for this case
if (min.isNaN() || max.isNaN()) {
stats.setMinMax(0.0f, 0.0f);
((Statistics<?>) stats).hasNonNullValue = false;
} else {
// Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
if (Float.compare(min, 0.0f) == 0) {
min = -0.0f;
stats.setMinMax(min, max);
}
if (Float.compare(max, -0.0f) == 0) {
max = 0.0f;
stats.setMinMax(min, max);
}
}
}
return stats;
}
}

// Builder for DOUBLE type to handle special cases of min/max values like NaN, -0.0, and 0.0
private static class DoubleBuilder extends Builder {
public DoubleBuilder(PrimitiveTypeName type) {
super(type);
assert type == PrimitiveTypeName.DOUBLE;
}

@Override
public Statistics<?> build() {
DoubleStatistics stats = (DoubleStatistics) super.build();
if (stats.hasNonNullValue()) {
Double min = stats.genericGetMin();
Double max = stats.genericGetMax();
// Drop min/max values in case of NaN as the sorting order of values is undefined for this case
if (min.isNaN() || max.isNaN()) {
stats.setMinMax(0.0, 0.0);
((Statistics<?>) stats).hasNonNullValue = false;
} else {
// Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
if (Double.compare(min, 0.0) == 0) {
min = -0.0;
stats.setMinMax(min, max);
}
if (Double.compare(max, -0.0) == 0) {
max = 0.0;
stats.setMinMax(min, max);
}
}
}
return stats;
}
}

private boolean hasNonNullValue;
private long num_nulls;

Expand Down Expand Up @@ -112,8 +178,15 @@ public static Statistics getStatsBasedOnType(PrimitiveTypeName type) {
* type of the column
* @return builder to create new statistics object
*/
public static Builder getBuilder(PrimitiveTypeName type) {
return new Builder(type);
public static Builder getBuilderForReading(PrimitiveTypeName type) {
switch (type) {
case FLOAT:
return new FloatBuilder(type);
case DOUBLE:
return new DoubleBuilder(type);
default:
return new Builder(type);
}
}

/**
Expand Down Expand Up @@ -221,7 +294,7 @@ public void mergeStatistics(Statistics stats) {
* Abstract method to set min and max values from byte arrays.
* @param minBytes byte array to set the min value to
* @param maxBytes byte array to set the max value to
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
* @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)} instead.
*/
@Deprecated
abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);
Expand Down Expand Up @@ -283,7 +356,7 @@ public long getNumNulls() {
*
* @param nulls
* null count to set the count to
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
* @deprecated will be removed in 2.0.0. Use {@link #getBuilderForReading(PrimitiveType)} instead.
*/
@Deprecated
public void setNumNulls(long nulls) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,25 @@
*/
package org.apache.parquet.column.statistics;

import static java.lang.Double.doubleToLongBits;
import static java.lang.Float.floatToIntBits;
import static org.apache.parquet.bytes.BytesUtils.intToBytes;
import static org.apache.parquet.bytes.BytesUtils.longToBytes;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96;
import static org.junit.Assert.*;

import java.nio.ByteBuffer;

import org.junit.Test;

import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;

public class TestStatistics {
private int[] integerArray;
Expand Down Expand Up @@ -585,4 +597,142 @@ private void testMergingStringStats() {
assertEquals(stats.getMax(), Binary.fromString("zzzz"));
assertEquals(stats.getMin(), Binary.fromString(""));
}

@Test
public void testBuilder() {
testBuilder(BOOLEAN, false, new byte[] { 0 }, true, new byte[] { 1 });
testBuilder(INT32, -42, intToBytes(-42), 42, intToBytes(42));
testBuilder(INT64, -42l, longToBytes(-42), 42l, longToBytes(42));
testBuilder(FLOAT, -42.0f, intToBytes(floatToIntBits(-42.0f)), 42.0f,
intToBytes(floatToIntBits(42.0f)));
testBuilder(DOUBLE, -42.0, longToBytes(doubleToLongBits(-42.0)), 42.0,
longToBytes(Double.doubleToLongBits(42.0f)));

byte[] min = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
byte[] max = { 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24 };
testBuilder(INT96, Binary.fromConstantByteArray(min), min,
Binary.fromConstantByteArray(max), max);
testBuilder(FIXED_LEN_BYTE_ARRAY, Binary.fromConstantByteArray(min),
min,
Binary.fromConstantByteArray(max), max);
testBuilder(BINARY, Binary.fromConstantByteArray(min), min,
Binary.fromConstantByteArray(max), max);
}

private void testBuilder(PrimitiveTypeName type, Object min, byte[] minBytes, Object max, byte[] maxBytes) {
Statistics.Builder builder = Statistics.getBuilderForReading(type);
Statistics<?> stats = builder.build();
assertTrue(stats.isEmpty());
assertFalse(stats.isNumNullsSet());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withNumNulls(0).withMin(minBytes).build();
assertFalse(stats.isEmpty());
assertTrue(stats.isNumNullsSet());
assertFalse(stats.hasNonNullValue());
assertEquals(0, stats.getNumNulls());

builder = Statistics.getBuilderForReading(type);
stats = builder.withNumNulls(11).withMax(maxBytes).build();
assertFalse(stats.isEmpty());
assertTrue(stats.isNumNullsSet());
assertFalse(stats.hasNonNullValue());
assertEquals(11, stats.getNumNulls());

builder = Statistics.getBuilderForReading(type);
stats = builder.withNumNulls(42).withMin(minBytes).withMax(maxBytes).build();
assertFalse(stats.isEmpty());
assertTrue(stats.isNumNullsSet());
assertTrue(stats.hasNonNullValue());
assertEquals(42, stats.getNumNulls());
assertEquals(min, stats.genericGetMin());
assertEquals(max, stats.genericGetMax());
}

@Test
public void testSpecBuilderForFloat() {
PrimitiveTypeName type = FLOAT;
Statistics.Builder builder = Statistics.getBuilderForReading(type);
Statistics<?> stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN)))
.withMax(intToBytes(floatToIntBits(42.0f))).withNumNulls(0).build();
assertTrue(stats.isNumNullsSet());
assertEquals(0, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(intToBytes(floatToIntBits(-42.0f)))
.withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(11).build();
assertTrue(stats.isNumNullsSet());
assertEquals(11, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(intToBytes(floatToIntBits(Float.NaN)))
.withMax(intToBytes(floatToIntBits(Float.NaN))).withNumNulls(42).build();
assertTrue(stats.isNumNullsSet());
assertEquals(42, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(intToBytes(floatToIntBits(0.0f)))
.withMax(intToBytes(floatToIntBits(42.0f))).build();
assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin()));
assertEquals(0, Float.compare(42.0f, (Float) stats.genericGetMax()));

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(intToBytes(floatToIntBits(-42.0f)))
.withMax(intToBytes(floatToIntBits(-0.0f))).build();
assertEquals(0, Float.compare(-42.0f, (Float) stats.genericGetMin()));
assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax()));

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(intToBytes(floatToIntBits(0.0f)))
.withMax(intToBytes(floatToIntBits(-0.0f))).build();
assertEquals(0, Float.compare(-0.0f, (Float) stats.genericGetMin()));
assertEquals(0, Float.compare(0.0f, (Float) stats.genericGetMax()));
}

@Test
public void testSpecBuilderForDouble() {
PrimitiveTypeName type = DOUBLE;
Statistics.Builder builder = Statistics.getBuilderForReading(type);
Statistics<?> stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN)))
.withMax(longToBytes(doubleToLongBits(42.0))).withNumNulls(0).build();
assertTrue(stats.isNumNullsSet());
assertEquals(0, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(longToBytes(doubleToLongBits(-42.0)))
.withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(11).build();
assertTrue(stats.isNumNullsSet());
assertEquals(11, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(longToBytes(doubleToLongBits(Double.NaN)))
.withMax(longToBytes(doubleToLongBits(Double.NaN))).withNumNulls(42).build();
assertTrue(stats.isNumNullsSet());
assertEquals(42, stats.getNumNulls());
assertFalse(stats.hasNonNullValue());

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(longToBytes(doubleToLongBits(0.0)))
.withMax(longToBytes(doubleToLongBits(42.0))).build();
assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
assertEquals(0, Double.compare(42.0, (Double) stats.genericGetMax()));

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(longToBytes(doubleToLongBits(-42.0)))
.withMax(longToBytes(doubleToLongBits(-0.0))).build();
assertEquals(0, Double.compare(-42.0, (Double) stats.genericGetMin()));
assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));

builder = Statistics.getBuilderForReading(type);
stats = builder.withMin(longToBytes(doubleToLongBits(0.0)))
.withMax(longToBytes(doubleToLongBits(-0.0))).build();
assertEquals(0, Double.compare(-0.0, (Double) stats.genericGetMin()));
assertEquals(0, Double.compare(0.0, (Double) stats.genericGetMax()));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
(String createdBy, Statistics statistics, PrimitiveTypeName type, SortOrder typeSortOrder) {
// create stats object based on the column type
org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
org.apache.parquet.column.statistics.Statistics.getBuilder(type);
org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type);
// If there was no statistics written to the footer, create an empty Statistics object and return

// NOTE: See docs in CorruptStatistics for explanation of why this check is needed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column
private static final IntStatistics intStats = new IntStatistics();
private static final IntStatistics nullIntStats = new IntStatistics();
private static final org.apache.parquet.column.statistics.Statistics<?> emptyIntStats = org.apache.parquet.column.statistics.Statistics
.getBuilder(PrimitiveTypeName.INT32).build();
.getBuilderForReading(PrimitiveTypeName.INT32).build();
private static final DoubleStatistics doubleStats = new DoubleStatistics();
private static final org.apache.parquet.column.statistics.Statistics<?> missingMinMaxDoubleStats = org.apache.parquet.column.statistics.Statistics
.getBuilder(PrimitiveTypeName.DOUBLE).withNumNulls(100).build();
.getBuilderForReading(PrimitiveTypeName.DOUBLE).withNumNulls(100).build();

static {
intStats.setMinMax(10, 100);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ public void test() throws Exception {
int v = 3;
BytesInput definitionLevels = BytesInput.fromInt(d);
BytesInput repetitionLevels = BytesInput.fromInt(r);
Statistics<?> statistics = Statistics.getBuilder(PrimitiveTypeName.BINARY).build();
Statistics<?> statistics = Statistics.getBuilderForReading(PrimitiveTypeName.BINARY).build();
BytesInput data = BytesInput.fromInt(v);
int rowCount = 5;
int nullCount = 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ public class TestParquetFileWriter {
private static final CompressionCodecName CODEC = CompressionCodecName.UNCOMPRESSED;

private static final org.apache.parquet.column.statistics.Statistics<?> EMPTY_STATS = org.apache.parquet.column.statistics.Statistics
.getBuilder(PrimitiveTypeName.BINARY).build();
.getBuilderForReading(PrimitiveTypeName.BINARY).build();

private String writeSchema;

Expand Down