Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,122 +18,259 @@
*/
package org.apache.parquet.column.statistics;

import org.apache.parquet.filter2.predicate.ByteSignedness;
import org.apache.parquet.io.api.Binary;

/**
* BinaryStatistics: tracks statistics information on binary data columns.
* There are two sets of mins and maxes: those based on signed and unsigned byte comparisons.
* For example, given the Binary values {@code Binary.fromString("a")} and {@code Binary.fromString("é")},
* the signed min will be "é", because the first byte of its encoding is larger than 127 and is
* therefore negative when compared as a signed byte; the unsigned min will be "a".
*/
public class BinaryStatistics extends Statistics<Binary> {

// NOTE(review): max/min look like the pre-change fields still shown by this diff view; confirm.
private Binary max;
private Binary min;
// Bounds tracked with the signed comparison (Binary#compareTo in this class).
private Binary maxSigned;
private Binary minSigned;

// Bounds tracked with Binary.compareTwoBinaryUnsigned.
private Binary minUnsigned;
private Binary maxUnsigned;

/**
 * Folds a single value into the statistics.
 * While no non-null value has been recorded, the value initializes the min/max pairs;
 * afterwards the signed and unsigned pairs are each updated with it.
 * @param value value used as both the min and the max candidate
 */
@Override
public void updateStats(Binary value) {
if (!this.hasNonNullValue()) {
// NOTE(review): initializeStats looks like the pre-change call retained by the diff view; confirm.
initializeStats(value, value);
initializeStatsSigned(value, value);
initializeStatsUnsigned(value, value);
} else {
updateStatsSigned(value, value);
updateStatsUnsigned(value, value);
}
}

/**
 * Folds a single value into the signed min/max only.
 * While {@code hasNonNullValue()} is false the value initializes the signed pair;
 * otherwise it is passed to the two-argument signed update as both candidates.
 * @param value value used as both the min and the max candidate
 */
@Override
public void updateStatsSigned(Binary value) {
  if (this.hasNonNullValue()) {
    updateStatsSigned(value, value);
    return;
  }
  initializeStatsSigned(value, value);
}

/**
 * Folds a single value into the unsigned min/max only.
 * While {@code hasNonNullValue()} is false the value initializes the unsigned pair;
 * otherwise it is passed to the two-argument unsigned update as both candidates.
 * @param value value used as both the min and the max candidate
 */
@Override
public void updateStatsUnsigned(Binary value) {
  if (!this.hasNonNullValue()) {
    initializeStatsUnsigned(value, value);
  } else {
    // Only the unsigned pair is touched, mirroring updateStatsSigned(Binary). The plain
    // min/max fields are never set by initializeStatsUnsigned, so updating them here
    // would dereference null.
    updateStatsUnsigned(value, value);
  }
}

/**
 * Merges the min/max values of another statistics object into this one.
 * The argument is cast to BinaryStatistics. If this object has no non-null value yet,
 * its bounds are initialized from the other object's bounds; otherwise they are updated with them.
 * @param stats statistics to merge in; cast to BinaryStatistics without a prior type check
 */
@Override
public void mergeStatisticsMinMax(Statistics stats) {
BinaryStatistics binaryStats = (BinaryStatistics)stats;
if (!this.hasNonNullValue()) {
// NOTE(review): the initializeStats(getMin, getMax) call looks like a pre-change line retained by the diff view; confirm.
initializeStats(binaryStats.getMin(), binaryStats.getMax());
initializeStatsSigned(binaryStats.genericGetMinSigned(), binaryStats.genericGetMaxSigned());
initializeStatsUnsigned(binaryStats.genericGetMinUnsigned(), binaryStats.genericGetMaxUnsigned());
} else {
// NOTE(review): the updateStats(getMin, getMax) call looks like a pre-change line retained by the diff view; confirm.
updateStats(binaryStats.getMin(), binaryStats.getMax());
updateStatsSigned(binaryStats.genericGetMinSigned(), binaryStats.genericGetMaxSigned());
updateStatsUnsigned(binaryStats.genericGetMinUnsigned(), binaryStats.genericGetMaxUnsigned());
}
}

/**
 * Sets the min and max values from raw bytes, re-using the byte[] passed in.
 * Any changes made to the byte[] will be reflected in min/max and minSigned/maxSigned as well.
 * The unsigned pair is assigned the result of {@code copy()} on the signed values.
 * Finally marks the statistics as not empty.
 * @param minBytes byte array to set the min and minSigned values to
 * @param maxBytes byte array to set the max and maxSigned values to
 */
@Override
public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes) {
// NOTE(review): the max/min assignments look like pre-change lines retained by the diff view; confirm.
max = Binary.fromReusedByteArray(maxBytes);
min = Binary.fromReusedByteArray(minBytes);
maxSigned = Binary.fromReusedByteArray(maxBytes);
minSigned = Binary.fromReusedByteArray(minBytes);
maxUnsigned = maxSigned.copy();
minUnsigned = minSigned.copy();
this.markAsNotEmpty();
}

/**
 * Sets only the signed min and max from raw bytes, re-using the byte[] passed in.
 * Unlike setMinMaxFromBytes, this method does not call markAsNotEmpty().
 * @param minBytes byte array to set the minSigned value to
 * @param maxBytes byte array to set the maxSigned value to
 */
@Override
public void setMinMaxSignedFromBytes(byte[] minBytes, byte[] maxBytes) {
  minSigned = Binary.fromReusedByteArray(minBytes);
  maxSigned = Binary.fromReusedByteArray(maxBytes);
}

/**
 * Sets only the unsigned min and max from raw bytes, re-using the byte[] passed in.
 * Unlike setMinMaxFromBytes, this method does not call markAsNotEmpty().
 * @param minBytes byte array to set the minUnsigned value to
 * @param maxBytes byte array to set the maxUnsigned value to
 */
@Override
public void setMinMaxUnsignedFromBytes(byte[] minBytes, byte[] maxBytes) {
  minUnsigned = Binary.fromReusedByteArray(minBytes);
  maxUnsigned = Binary.fromReusedByteArray(maxBytes);
}

/**
 * Returns the max value as a byte array.
 * @deprecated Use either getMaxBytesSigned() or getMaxBytesUnsigned() directly instead.
 */
@Deprecated
@Override
public byte[] getMaxBytes() {
// NOTE(review): this first return looks like the pre-change line retained by the diff view; confirm.
return max == null ? null : max.getBytes();
return getMaxBytesSigned();
}

/**
 * Returns the min value as a byte array.
 * @deprecated Use either getMinBytesSigned() or getMinBytesUnsigned() directly instead.
 */
@Deprecated
@Override
public byte[] getMinBytes() {
// NOTE(review): this first return looks like the pre-change line retained by the diff view; confirm.
return min == null ? null : min.getBytes();
return getMinBytesSigned();
}

/**
 * Returns the bytes of the signed max.
 * @return {@code maxSigned.getBytes()}, or null when maxSigned has not been set
 */
@Override
public byte[] getMaxBytesSigned() {
  return maxSigned == null ? null : maxSigned.getBytes();
}

/**
 * Returns the bytes of the signed min.
 * @return {@code minSigned.getBytes()}, or null when minSigned has not been set
 */
@Override
public byte[] getMinBytesSigned() {
  if (minSigned == null) {
    return null;
  }
  return minSigned.getBytes();
}

/**
 * Returns the bytes of the unsigned max.
 * @return {@code maxUnsigned.getBytes()}, or null when maxUnsigned has not been set
 */
@Override
public byte[] getMaxBytesUnsigned() {
  if (maxUnsigned == null) {
    return null;
  }
  return maxUnsigned.getBytes();
}

/**
 * Returns the bytes of the unsigned min.
 * @return {@code minUnsigned.getBytes()}, or null when minUnsigned has not been set
 */
@Override
public byte[] getMinBytesUnsigned() {
  if (minUnsigned == null) {
    return null;
  }
  return minUnsigned.getBytes();
}

/**
 * Reports whether the min and max values fit in the given size.
 * True when there is no non-null value, or when the summed lengths of the signed pair and of
 * the unsigned pair are each less than {@code size}.
 * @param size upper bound (exclusive) for the combined min + max length
 */
@Override
public boolean isSmallerThan(long size) {
// NOTE(review): this first return looks like the pre-change line retained by the diff view; confirm.
return !hasNonNullValue() || ((min.length() + max.length()) < size);
return !hasNonNullValue() || (((minSigned.length() + maxSigned.length()) < size) && ((minUnsigned.length() + maxUnsigned.length()) < size));

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems like minSigned.length() + maxSigned.length() should be the same as minUnsigned.length() + maxUnsigned.length() right? we could think about dropping one pair from this comparison

Copy link
Author

@a10y a10y Sep 6, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there are cases where that might not be true. Take the following example column:

é
ello
a

minSigned = é
maxSigned = ello

minUnsigned = a
maxUnsigned = é

}

/**
 * Renders the statistics as text: min, max and null count when a non-null value exists,
 * only the null count when the statistics are non-empty without one, and a fixed
 * "no stats" message otherwise.
 */
@Override
public String toString() {
if (this.hasNonNullValue())
// NOTE(review): the min/max variant looks like the pre-change line retained by the diff view; confirm.
return String.format("min: %s, max: %s, num_nulls: %d", min.toStringUsingUTF8(), max.toStringUsingUTF8(), this.getNumNulls());
return String.format("min: %s, max: %s, num_nulls: %d", minSigned.toStringUsingUTF8(), maxSigned.toStringUsingUTF8(), this.getNumNulls());
else if (!this.isEmpty())
return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
else
return "no stats for this column";
}

/**
 * Tries to update the unsigned min and max to the new potential min_value and max_value.
 * A bound is replaced by a copy of its candidate when Binary.compareTwoBinaryUnsigned
 * places the candidate outside the current bound.
 * NOTE(review): the updateStats(Binary, Binary) header and its min/max lines look like
 * pre-change lines retained by the diff view; confirm.
 * @param min_value candidate for the unsigned min
 * @param max_value candidate for the unsigned max
 * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
 */
@Deprecated
public void updateStats(Binary min_value, Binary max_value) {
if (min.compareTo(min_value) > 0) { min = min_value.copy(); }
if (max.compareTo(max_value) < 0) { max = max_value.copy(); }
public void updateStatsUnsigned(Binary min_value, Binary max_value) {
if (Binary.compareTwoBinaryUnsigned(minUnsigned, min_value) > 0) { minUnsigned = min_value.copy(); }
if (Binary.compareTwoBinaryUnsigned(maxUnsigned, max_value) < 0) { maxUnsigned = max_value.copy(); }
}

/**
 * Tries to update the signed min and max to the new potential min_value and max_value.
 * A bound is replaced by a copy of its candidate when compareTo places the candidate
 * outside the current bound.
 * NOTE(review): the initializeStats(Binary, Binary) header and the three lines after it look
 * like pre-change lines retained by the diff view; confirm.
 * @param min_value candidate for the signed min
 * @param max_value candidate for the signed max
 * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
 */
@Deprecated
public void initializeStats(Binary min_value, Binary max_value) {
min = min_value.copy();
max = max_value.copy();
this.markAsNotEmpty();
public void updateStatsSigned(Binary min_value, Binary max_value) {
if (minSigned.compareTo(min_value) > 0) { minSigned = min_value.copy(); }
if (maxSigned.compareTo(max_value) < 0) { maxSigned = max_value.copy(); }
}

/**
 * Initializes only the unsigned min/max fields, each from a copy of the given value,
 * and marks the statistics as not empty.
 * @param min_value value whose copy becomes the unsigned min
 * @param max_value value whose copy becomes the unsigned max
 */
public void initializeStatsUnsigned(Binary min_value, Binary max_value) {
  this.minUnsigned = min_value.copy();
  this.maxUnsigned = max_value.copy();
  markAsNotEmpty();
}

/**
 * Initializes only the signed min/max fields, each from a copy of the given value,
 * and marks the statistics as not empty.
 * @param min_value value whose copy becomes the signed min
 * @param max_value value whose copy becomes the signed max
 */
public void initializeStatsSigned(Binary min_value, Binary max_value) {
  this.minSigned = min_value.copy();
  this.maxSigned = max_value.copy();
  markAsNotEmpty();
}

/**
 * Returns the min value.
 * @deprecated For BinaryStatistics use one of genericGetMinSigned() or genericGetMinUnsigned()
 */
@Deprecated
@Override
public Binary genericGetMin() {
// NOTE(review): this first return looks like the pre-change line retained by the diff view; confirm.
return min;
return genericGetMinSigned();
}

/**
 * Returns the max value.
 * @deprecated For BinaryStatistics use one of genericGetMaxSigned() or genericGetMaxUnsigned()
 */
@Deprecated
@Override
public Binary genericGetMax() {
// NOTE(review): this first return looks like the pre-change line retained by the diff view; confirm.
return max;
return genericGetMaxSigned();
}

/**
 * Returns the current signed min; the field is returned as-is, without copying.
 */
@Override
public Binary genericGetMinSigned() {
  return this.minSigned;
}

/**
 * Returns the current signed max; the field is returned as-is, without copying.
 */
@Override
public Binary genericGetMaxSigned() {
  return this.maxSigned;
}

/**
 * Returns the current unsigned min; the field is returned as-is, without copying.
 */
@Override
public Binary genericGetMinUnsigned() {
  return this.minUnsigned;
}

/**
 * Returns the current unsigned max; the field is returned as-is, without copying.
 */
@Override
public Binary genericGetMaxUnsigned() {
  return this.maxUnsigned;
}

/**
 * Returns the max value.
 * @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0
 */
@Deprecated
public Binary getMax() {
// NOTE(review): this first return looks like the pre-change line retained by the diff view; confirm.
return max;
return maxSigned;
}

/**
 * Returns the min value.
 * @deprecated use {@link #genericGetMin()}, will be removed in 2.0.0
 */
@Deprecated
public Binary getMin() {
// NOTE(review): this first return looks like the pre-change line retained by the diff view; confirm.
return min;
return minSigned;
}

/**
 * Sets the min and max values directly and marks the statistics as not empty.
 * The given references are stored as-is (no copy() is made here) in both the signed and the
 * unsigned pair.
 * @param min value to store as the min
 * @param max value to store as the max
 * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
 */
@Deprecated
public void setMinMax(Binary min, Binary max) {
// NOTE(review): the this.max/this.min assignments look like pre-change lines retained by the diff view; confirm.
this.max = max;
this.min = min;
this.maxSigned = max;
this.minSigned = min;
this.maxUnsigned = max;
this.minUnsigned = min;
this.markAsNotEmpty();
}

/**
 * Compares a value against the min for the requested byte signedness.
 * SIGNED uses {@code value.compareTo} against the signed min; any other signedness uses
 * {@code Binary.compareTwoBinaryUnsigned} against the unsigned min.
 * @param value value to compare
 * @param signedness which min and which comparison to use
 * @return the result of the selected comparison
 */
@Override
public final int compareValueToMin(Binary value, ByteSignedness signedness) {
  if (signedness != ByteSignedness.SIGNED) {
    return Binary.compareTwoBinaryUnsigned(value, genericGetMinUnsigned());
  }
  return value.compareTo(genericGetMinSigned());
}

/**
 * Compares a value against the max for the requested byte signedness.
 * SIGNED uses {@code value.compareTo} against the signed max; any other signedness uses
 * {@code Binary.compareTwoBinaryUnsigned} against the unsigned max.
 * @param value value to compare
 * @param signedness which max and which comparison to use
 * @return the result of the selected comparison
 */
@Override
public final int compareValueToMax(Binary value, ByteSignedness signedness) {
  if (signedness != ByteSignedness.SIGNED) {
    return Binary.compareTwoBinaryUnsigned(value, genericGetMaxUnsigned());
  }
  return value.compareTo(genericGetMaxSigned());
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package org.apache.parquet.column.statistics;

import org.apache.parquet.column.UnknownColumnTypeException;
import org.apache.parquet.filter2.predicate.ByteSignedness;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import java.util.Arrays;
Expand Down Expand Up @@ -115,6 +116,24 @@ public void updateStats(Binary value) {
throw new UnsupportedOperationException();
}

/**
 * Updates the signed min and max statistics using the passed value.
 * This implementation does not support the operation and always throws.
 * @param value value to use to update signed_min and signed_max
 * @throws UnsupportedOperationException always, in this implementation
 */
public void updateStatsSigned(Binary value) {
  throw new UnsupportedOperationException();
}

/**
 * Updates the unsigned min and max statistics using the passed value.
 * This implementation does not support the operation and always throws.
 * @param value value to use to update unsigned_min and unsigned_max
 * @throws UnsupportedOperationException always, in this implementation
 */
public void updateStatsUnsigned(Binary value) {
  throw new UnsupportedOperationException();
}

/**
* Equality comparison method to compare two statistics objects.
* @param other Object to compare against
Expand Down Expand Up @@ -175,9 +194,49 @@ public void mergeStatistics(Statistics stats) {
*/
abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);

/**
 * Sets the signed min and max from raw bytes.
 * This default simply forwards to {@link #setMinMaxFromBytes(byte[], byte[])}.
 * @param minBytes bytes of the min value
 * @param maxBytes bytes of the max value
 */
public void setMinMaxSignedFromBytes(byte[] minBytes, byte[] maxBytes) {
  this.setMinMaxFromBytes(minBytes, maxBytes);
}

/**
 * Sets the unsigned min and max from raw bytes.
 * This default simply forwards to {@link #setMinMaxFromBytes(byte[], byte[])}.
 * @param minBytes bytes of the min value
 * @param maxBytes bytes of the max value
 */
public void setMinMaxUnsignedFromBytes(byte[] minBytes, byte[] maxBytes) {
  this.setMinMaxFromBytes(minBytes, maxBytes);
}

/** Returns the min value, typed as the statistics' value type. */
abstract public T genericGetMin();
/** Returns the max value, typed as the statistics' value type. */
abstract public T genericGetMax();

/**
 * Returns the min under signed comparison.
 * This default simply returns {@link #genericGetMin()}.
 */
public T genericGetMinSigned() {
  return this.genericGetMin();
}
/**
 * Returns the max under signed comparison.
 * This default simply returns {@link #genericGetMax()}.
 */
public T genericGetMaxSigned() {
  return this.genericGetMax();
}

/**
 * Returns the min under unsigned comparison.
 * This default simply returns {@link #genericGetMin()}.
 */
public T genericGetMinUnsigned() {
  return this.genericGetMin();
}

/**
 * Returns the max under unsigned comparison.
 * This default simply returns {@link #genericGetMax()}.
 */
public T genericGetMaxUnsigned() {
  return this.genericGetMax();
}

/**
 * Compares a value against the min selected by the given signedness.
 * SIGNED selects {@link #genericGetMinSigned()}; anything else selects
 * {@link #genericGetMinUnsigned()}. The comparison itself is {@code value.compareTo(bound)}.
 * @param value value to compare
 * @param signedness which min to compare against
 * @return the result of {@code value.compareTo} on the selected min
 */
public int compareValueToMin(T value, ByteSignedness signedness) {
  final T bound = (signedness == ByteSignedness.SIGNED)
      ? genericGetMinSigned()
      : genericGetMinUnsigned();
  return value.compareTo(bound);
}

/**
 * Compares a value against the max selected by the given signedness.
 * SIGNED selects {@link #genericGetMaxSigned()}; anything else selects
 * {@link #genericGetMaxUnsigned()}. The comparison itself is {@code value.compareTo(bound)}.
 * @param value value to compare
 * @param signedness which max to compare against
 * @return the result of {@code value.compareTo} on the selected max
 */
public int compareValueToMax(T value, ByteSignedness signedness) {
  final T bound = (signedness == ByteSignedness.SIGNED)
      ? genericGetMaxSigned()
      : genericGetMaxUnsigned();
  return value.compareTo(bound);
}


/**
* Abstract method to return the max value as a byte array
* @return byte array corresponding to the max value
Expand All @@ -190,6 +249,22 @@ public void mergeStatistics(Statistics stats) {
*/
abstract public byte[] getMinBytes();

/**
 * Returns the signed min as a byte array.
 * This default simply returns {@link #getMinBytes()}.
 */
public byte[] getMinBytesSigned() {
  return this.getMinBytes();
}

/**
 * Returns the signed max as a byte array.
 * This default simply returns {@link #getMaxBytes()}.
 */
public byte[] getMaxBytesSigned() {
  return this.getMaxBytes();
}

/**
 * Returns the unsigned min as a byte array.
 * This default simply returns {@link #getMinBytes()}.
 */
public byte[] getMinBytesUnsigned() {
  return this.getMinBytes();
}

/**
 * Returns the unsigned max as a byte array.
 * This default simply returns {@link #getMaxBytes()}.
 */
public byte[] getMaxBytesUnsigned() {
  return this.getMaxBytes();
}

/**
* Abstract method to return whether the min and max values fit in the given
* size.
Expand Down
Loading