diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index c319b4adb0..3333a6be50 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -18,19 +18,49 @@ */ package org.apache.parquet.column.statistics; +import org.apache.parquet.filter2.predicate.ByteSignedness; import org.apache.parquet.io.api.Binary; +/** + * BinaryStatistics: tracks statistics information on binary data columns. + * There are two sets of mins and maxes: those based on signed and unsigned byte comparisons. + * For example, given a set of Binary values {@code Binary.fromString("a")}, {@code Binary.fromString("é")}, + * the signed min will be "é" as the first byte of the codepoint will be larger than 127 + */ public class BinaryStatistics extends Statistics { - private Binary max; - private Binary min; + private Binary maxSigned; + private Binary minSigned; + + private Binary minUnsigned; + private Binary maxUnsigned; @Override public void updateStats(Binary value) { if (!this.hasNonNullValue()) { - initializeStats(value, value); + initializeStatsSigned(value, value); + initializeStatsUnsigned(value, value); + } else { + updateStatsSigned(value, value); + updateStatsUnsigned(value, value); + } + } + + @Override + public void updateStatsSigned(Binary value) { + if (!this.hasNonNullValue()) { + initializeStatsSigned(value, value); + } else { + updateStatsSigned(value, value); + } + } + + @Override + public void updateStatsUnsigned(Binary value) { + if (!this.hasNonNullValue()) { + initializeStatsUnsigned(value, value); } else { - updateStats(value, value); + updateStatsUnsigned(value, value); } } @@ -38,44 +68,87 @@ public void updateStats(Binary value) { public void mergeStatisticsMinMax(Statistics stats) { BinaryStatistics binaryStats = (BinaryStatistics)stats; if (!this.hasNonNullValue()) { - initializeStats(binaryStats.getMin(), binaryStats.getMax()); + initializeStatsSigned(binaryStats.genericGetMinSigned(), binaryStats.genericGetMaxSigned()); + initializeStatsUnsigned(binaryStats.genericGetMinUnsigned(), binaryStats.genericGetMaxUnsigned()); } else { - updateStats(binaryStats.getMin(), binaryStats.getMax()); + updateStatsSigned(binaryStats.genericGetMinSigned(), binaryStats.genericGetMaxSigned()); + updateStatsUnsigned(binaryStats.genericGetMinUnsigned(), binaryStats.genericGetMaxUnsigned()); } } /** - * Sets min and max values, re-uses the byte[] passed in. - * Any changes made to byte[] will be reflected in min and max values as well. - * @param minBytes byte array to set the min value to - * @param maxBytes byte array to set the max value to + * Sets minSigned and maxSigned values, re-uses the byte[] passed in. + * Any changes made to byte[] will be reflected in minSigned and maxSigned values as well. + * @param minBytes byte array to set the minSigned value to + * @param maxBytes byte array to set the maxSigned value to */ @Override public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes) { - max = Binary.fromReusedByteArray(maxBytes); - min = Binary.fromReusedByteArray(minBytes); + maxSigned = Binary.fromReusedByteArray(maxBytes); + minSigned = Binary.fromReusedByteArray(minBytes); + maxUnsigned = maxSigned.copy(); + minUnsigned = minSigned.copy(); this.markAsNotEmpty(); } + @Override + public void setMinMaxSignedFromBytes(byte[] minBytes, byte[] maxBytes) { + this.minSigned = Binary.fromReusedByteArray(minBytes); + this.maxSigned = Binary.fromReusedByteArray(maxBytes); + } + + @Override + public void setMinMaxUnsignedFromBytes(byte[] minBytes, byte[] maxBytes) { + this.minUnsigned = Binary.fromReusedByteArray(minBytes); + this.maxUnsigned = Binary.fromReusedByteArray(maxBytes); + } + + /** + * Use either getMaxBytesSigned() or getMaxBytesUnsigned() directly instead. + */ + @Deprecated @Override public byte[] getMaxBytes() { - return max == null ? null : max.getBytes(); + return getMaxBytesSigned(); } + /** + * Use either getMinBytesSigned() or getMinBytesUnsigned() directly instead. + */ + @Deprecated @Override public byte[] getMinBytes() { - return min == null ? null : min.getBytes(); + return getMinBytesSigned(); + } + + public byte[] getMaxBytesSigned() { + return maxSigned == null ? null : maxSigned.getBytes(); + } + + @Override + public byte[] getMinBytesSigned() { + return minSigned == null ? null : minSigned.getBytes(); + } + + @Override + public byte[] getMaxBytesUnsigned() { + return maxUnsigned == null ? null : maxUnsigned.getBytes(); + } + + @Override + public byte[] getMinBytesUnsigned() { + return minUnsigned == null ? null : minUnsigned.getBytes(); } @Override public boolean isSmallerThan(long size) { - return !hasNonNullValue() || ((min.length() + max.length()) < size); + return !hasNonNullValue() || (((minSigned.length() + maxSigned.length()) < size) && ((minUnsigned.length() + maxUnsigned.length()) < size)); } @Override public String toString() { if (this.hasNonNullValue()) - return String.format("min: %s, max: %s, num_nulls: %d", min.toStringUsingUTF8(), max.toStringUsingUTF8(), this.getNumNulls()); + return String.format("min: %s, max: %s, num_nulls: %d", minSigned.toStringUsingUTF8(), maxSigned.toStringUsingUTF8(), this.getNumNulls()); else if (!this.isEmpty()) return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); else @@ -83,32 +156,75 @@ else if (!this.isEmpty()) } /** - * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + * Tries to update the unsigned min and max to the new potential min_value and max_value. */ - @Deprecated - public void updateStats(Binary min_value, Binary max_value) { - if (min.compareTo(min_value) > 0) { min = min_value.copy(); } - if (max.compareTo(max_value) < 0) { max = max_value.copy(); } + public void updateStatsUnsigned(Binary min_value, Binary max_value) { + if (Binary.compareTwoBinaryUnsigned(minUnsigned, min_value) > 0) { minUnsigned = min_value.copy(); } + if (Binary.compareTwoBinaryUnsigned(maxUnsigned, max_value) < 0) { maxUnsigned = max_value.copy(); } } /** - * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 + * Tries to update the signed min and max to the new potential min_value and max_value. */ - @Deprecated - public void initializeStats(Binary min_value, Binary max_value) { - min = min_value.copy(); - max = max_value.copy(); - this.markAsNotEmpty(); + public void updateStatsSigned(Binary min_value, Binary max_value) { + if (minSigned.compareTo(min_value) > 0) { minSigned = min_value.copy(); } + if (maxSigned.compareTo(max_value) < 0) { maxSigned = max_value.copy(); } + } + + /** + * Only initialize the unsigned min/max fields. + */ + public void initializeStatsUnsigned(Binary min_value, Binary max_value) { + minUnsigned = min_value.copy(); + maxUnsigned = max_value.copy(); + this.markAsNotEmpty(); + } + + /** + * Only initialize the signed min/max fields. + */ + public void initializeStatsSigned(Binary min_value, Binary max_value) { + minSigned = min_value.copy(); + maxSigned = max_value.copy(); + this.markAsNotEmpty(); } + /** + * For BinaryStatistics use one of genericGetMinSigned() or genericGetMinUnsigned() + */ + @Deprecated @Override public Binary genericGetMin() { - return min; + return genericGetMinSigned(); } + /** + * For BinaryStatistics use one of genericGetMaxSigned() or generic getMaxUnsigned() + */ + @Deprecated @Override public Binary genericGetMax() { - return max; + return genericGetMaxSigned(); + } + + @Override + public Binary genericGetMinSigned() { + return minSigned; + } + + @Override + public Binary genericGetMaxSigned() { + return maxSigned; + } + + @Override + public Binary genericGetMinUnsigned() { + return minUnsigned; + } + + @Override + public Binary genericGetMaxUnsigned() { + return maxUnsigned; } /** @@ -116,7 +232,7 @@ public Binary genericGetMax() { */ @Deprecated public Binary getMax() { - return max; + return maxSigned; } /** @@ -124,7 +240,7 @@ public Binary getMax() { */ @Deprecated public Binary getMin() { - return min; + return minSigned; } /** @@ -132,8 +248,29 @@ public Binary getMin() { */ @Deprecated public void setMinMax(Binary min, Binary max) { - this.max = max; - this.min = min; + this.maxSigned = max; + this.minSigned = min; + this.maxUnsigned = max; + this.minUnsigned = min; this.markAsNotEmpty(); } + + @Override + public final int compareValueToMin(Binary value, ByteSignedness signedness) { + if (signedness == ByteSignedness.SIGNED) { + return value.compareTo(genericGetMinSigned()); + } else { + return Binary.compareTwoBinaryUnsigned(value, genericGetMinUnsigned()); + } + } + + @Override + public final int compareValueToMax(Binary value, ByteSignedness signedness) { + if (signedness == ByteSignedness.SIGNED) { + return value.compareTo(genericGetMaxSigned()); + } else { + return Binary.compareTwoBinaryUnsigned(value, genericGetMaxUnsigned()); + } + } + } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 30153c0743..d418d0fa62 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -19,6 +19,7 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.column.UnknownColumnTypeException; +import org.apache.parquet.filter2.predicate.ByteSignedness; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import java.util.Arrays; @@ -115,6 +116,24 @@ public void updateStats(Binary value) { throw new UnsupportedOperationException(); } + /** + * updates statistics signed_min and signed_max using the + * passed value + * @param value value to use to update signed_min and signed_max + */ + public void updateStatsSigned(Binary value) { + throw new UnsupportedOperationException(); + } + + /** + * updates statistics unsigned_min and unsigned_max using the + * passed value + * @param value value to use to update unsigned_min and unsigned_max + */ + public void updateStatsUnsigned(Binary value) { + throw new UnsupportedOperationException(); + } + /** * Equality comparison method to compare two statistics objects. * @param other Object to compare against @@ -175,9 +194,49 @@ public void mergeStatistics(Statistics stats) { */ abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes); + public void setMinMaxSignedFromBytes(byte[] minBytes, byte[] maxBytes) { + setMinMaxFromBytes(minBytes, maxBytes); + } + + public void setMinMaxUnsignedFromBytes(byte[] minBytes, byte[] maxBytes) { + setMinMaxFromBytes(minBytes, maxBytes); + } + abstract public T genericGetMin(); abstract public T genericGetMax(); + public T genericGetMinSigned() { + return genericGetMin(); + } + public T genericGetMaxSigned() { + return genericGetMax(); + } + + public T genericGetMinUnsigned() { + return genericGetMin(); + } + + public T genericGetMaxUnsigned() { + return genericGetMax(); + } + + public int compareValueToMin(T value, ByteSignedness signedness) { + if (signedness == ByteSignedness.SIGNED) { + return value.compareTo(genericGetMinSigned()); + } else { + return value.compareTo(genericGetMinUnsigned()); + } + } + + public int compareValueToMax(T value, ByteSignedness signedness) { + if (signedness == ByteSignedness.SIGNED) { + return value.compareTo(genericGetMaxSigned()); + } else { + return value.compareTo(genericGetMaxUnsigned()); + } + } + + /** * Abstract method to return the max value as a byte array * @return byte array corresponding to the max value @@ -190,6 +249,22 @@ public void mergeStatistics(Statistics stats) { */ abstract public byte[] getMinBytes(); + public byte[] getMinBytesSigned() { + return getMinBytes(); + } + + public byte[] getMaxBytesSigned() { + return getMaxBytes(); + } + + public byte[] getMinBytesUnsigned() { + return getMinBytes(); + } + + public byte[] getMaxBytesUnsigned() { + return getMaxBytes(); + } + /** * Abstract method to return whether the min and max values fit in the given * size. diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/ByteSignedness.java b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/ByteSignedness.java new file mode 100644 index 0000000000..ca865359ea --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/ByteSignedness.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.filter2.predicate; + +/** + * The way bytes should be interpreted, either as signed 8-bit values or as unsigned 8-bit values. + */ +public enum ByteSignedness { + SIGNED, + UNSIGNED +} diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/FilterApi.java b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/FilterApi.java index b73e59c8e7..bd90fa1c42 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/FilterApi.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/FilterApi.java @@ -102,6 +102,13 @@ public static , C extends Column & SupportsEqNotEq> E return new Eq(column, value); } + /** + * Same as {@link FilterApi#eq(Column, Comparable)}, but allows specifying signedness. + */ + public static , C extends Column & SupportsEqNotEq> Eq eq(C column, T value, ByteSignedness signedness) { + return new Eq(column, value, signedness); + } + /** * Keeps records if their value is not equal to the provided value. * Nulls are treated the same way the java programming language does. @@ -120,6 +127,13 @@ public static , C extends Column & SupportsEqNotEq> N return new NotEq(column, value); } + /** + * Same as {@link FilterApi#notEq(Column, Comparable)}, but allows specifying signedness. + */ + public static , C extends Column & SupportsEqNotEq> NotEq notEq(C column, T value, ByteSignedness signedness) { + return new NotEq(column, value, signedness); + } + /** * Keeps records if their value is less than (but not equal to) the provided value. * The provided value cannot be null, as less than null has no meaning. @@ -131,6 +145,14 @@ public static , C extends Column & SupportsLtGt> Lt(column, value); } + /** + * Same as {@link FilterApi#lt(Column, Comparable)}, but allows specifying signedness. + */ + public static , C extends Column & SupportsLtGt> Lt lt(C column, T value, ByteSignedness signedness) { + return new Lt(column, value, signedness); + } + + /** * Keeps records if their value is less than or equal to the provided value. * The provided value cannot be null, as less than null has no meaning. @@ -142,6 +164,13 @@ public static , C extends Column & SupportsLtGt> LtEq return new LtEq(column, value); } + /** + * Same as {@link FilterApi#ltEq(Column, Comparable)}, but allows specifying signedness. + */ + public static , C extends Column & SupportsLtGt> LtEq ltEq(C column, T value, ByteSignedness signedness) { + return new LtEq(column, value, signedness); + } + /** * Keeps records if their value is greater than (but not equal to) the provided value. * The provided value cannot be null, as less than null has no meaning. @@ -153,6 +182,13 @@ public static , C extends Column & SupportsLtGt> Gt(column, value); } + /** + * Same as {@link FilterApi#gt(Column, Comparable)}, but allows specifying signedness. + */ + public static , C extends Column & SupportsLtGt> Gt gt(C column, T value, ByteSignedness signedness) { + return new Gt(column, value, signedness); + } + /** * Keeps records if their value is greater than or equal to the provided value. * The provided value cannot be null, as less than null has no meaning. @@ -164,6 +200,13 @@ public static , C extends Column & SupportsLtGt> GtEq return new GtEq(column, value); } + /** + * Same as {@link FilterApi#gtEq(Column, Comparable)}, but allows specifying signedness. + */ + public static , C extends Column & SupportsLtGt> GtEq gtEq(C column, T value, ByteSignedness signedness) { + return new GtEq(column, value, signedness); + } + /** * Keeps records that pass the provided {@link UserDefinedPredicate} * diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Operators.java b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Operators.java index eca0f6700a..222ca57257 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Operators.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Operators.java @@ -122,8 +122,14 @@ static abstract class ColumnFilterPredicate> implements private final Column column; private final T value; private final String toString; + private final ByteSignedness signedness; protected ColumnFilterPredicate(Column column, T value) { + // Default is to assume signed comparisons when not specified. + this(column, value, ByteSignedness.SIGNED); + } + + protected ColumnFilterPredicate(Column column, T value, ByteSignedness signedness) { this.column = checkNotNull(column, "column"); // Eq and NotEq allow value to be null, Lt, Gt, LtEq, GtEq however do not, so they guard against @@ -132,6 +138,8 @@ protected ColumnFilterPredicate(Column column, T value) { String name = getClass().getSimpleName().toLowerCase(Locale.ENGLISH); this.toString = name + "(" + column.getColumnPath().toDotString() + ", " + value + ")"; + + this.signedness = signedness; } public Column getColumn() { @@ -142,6 +150,10 @@ public T getValue() { return value; } + public ByteSignedness getSignedness() { + return signedness; + } + @Override public String toString() { return toString; @@ -176,6 +188,11 @@ public static final class Eq> extends ColumnFilterPredic super(column, value); } + // value can be null + Eq(Column column, T value, ByteSignedness signedness) { + super(column, value, signedness); + } + @Override public R accept(Visitor visitor) { return visitor.visit(this); @@ -190,6 +207,12 @@ public static final class NotEq> extends ColumnFilterPre super(column, value); } + // value can be null + NotEq(Column column, T value, ByteSignedness signedness) { + super(column, value, signedness); + } + + @Override public R accept(Visitor visitor) { return visitor.visit(this); @@ -204,6 +227,11 @@ public static final class Lt> extends ColumnFilterPredic super(column, checkNotNull(value, "value")); } + // value cannot be null + Lt(Column column, T value, ByteSignedness signedness) { + super(column, checkNotNull(value, "value"), signedness); + } + @Override public R accept(Visitor visitor) { return visitor.visit(this); @@ -217,6 +245,11 @@ public static final class LtEq> extends ColumnFilterPred super(column, checkNotNull(value, "value")); } + // value cannot be null + LtEq(Column column, T value, ByteSignedness signedness) { + super(column, checkNotNull(value, "value"), signedness); + } + @Override public R accept(Visitor visitor) { return visitor.visit(this); @@ -231,6 +264,12 @@ public static final class Gt> extends ColumnFilterPredic super(column, checkNotNull(value, "value")); } + // value cannot be null + Gt(Column column, T value, ByteSignedness signedness) { + super(column, checkNotNull(value, "value"), signedness); + } + + @Override public R accept(Visitor visitor) { return visitor.visit(this); @@ -244,6 +283,11 @@ public static final class GtEq> extends ColumnFilterPred super(column, checkNotNull(value, "value")); } + // value cannot be null + GtEq(Column column, T value, ByteSignedness signedness) { + super(column, checkNotNull(value, "value"), signedness); + } + @Override public R accept(Visitor visitor) { return visitor.visit(this); diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java index 50b98c202e..c089c91df9 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java @@ -194,12 +194,12 @@ public int compareTo(Binary other) { @Override int compareTo(byte[] other, int otherOffset, int otherLength) { - return Binary.compareTwoByteArrays(value, offset, length, other, otherOffset, otherLength); + return Binary.compareTwoByteArraysSigned(value, offset, length, other, otherOffset, otherLength); } @Override int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { - return Binary.compareByteArrayToByteBuffer(value, offset, length, bytes, otherOffset, otherLength); + return Binary.compareByteArrayToByteBufferSigned(value, offset, length, bytes, otherOffset, otherLength); } @Override @@ -350,12 +350,12 @@ public int compareTo(Binary other) { @Override int compareTo(byte[] other, int otherOffset, int otherLength) { - return Binary.compareTwoByteArrays(value, 0, value.length, other, otherOffset, otherLength); + return Binary.compareTwoByteArraysSigned(value, 0, value.length, other, otherOffset, otherLength); } @Override int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { - return Binary.compareByteArrayToByteBuffer(value, 0, value.length, bytes, otherOffset, otherLength); + return Binary.compareByteArrayToByteBufferSigned(value, 0, value.length, bytes, otherOffset, otherLength); } @Override @@ -515,16 +515,16 @@ public int compareTo(Binary other) { @Override int compareTo(byte[] other, int otherOffset, int otherLength) { if (value.hasArray()) { - return Binary.compareTwoByteArrays(value.array(), value.arrayOffset() + offset, length, + return Binary.compareTwoByteArraysSigned(value.array(), value.arrayOffset() + offset, length, other, otherOffset, otherLength); } { - return Binary.compareByteBufferToByteArray(value, offset, length, other, otherOffset, otherLength); + return Binary.compareByteBufferToByteArraySigned(value, offset, length, other, otherOffset, otherLength); } } @Override int compareTo(ByteBuffer bytes, int otherOffset, int otherLength) { - return Binary.compareTwoByteBuffers(value, offset, length, bytes, otherOffset, otherLength); + return Binary.compareTwoByteBuffersSigned(value, offset, length, bytes, otherOffset, otherLength); } @Override @@ -666,63 +666,73 @@ private static final boolean equals(byte[] array1, int offset1, int length1, byt return true; } - private static final int compareByteBufferToByteArray(ByteBuffer buf, int offset1, int length1, - byte[] array, int offset2, int length2) { - return -1 * Binary.compareByteArrayToByteBuffer(array, offset1, length1, buf, offset2, length2); + private static final int compareByteBufferToByteArraySigned(ByteBuffer buf, int offset1, int length1, + byte[] array, int offset2, int length2) { + return -1 * Binary.compareByteArrayToByteBufferSigned(array, offset1, length1, buf, offset2, length2); } - private static final int compareByteArrayToByteBuffer(byte[] array1, int offset1, int length1, - ByteBuffer buf, int offset2, int length2) { + private static final int compareByteArrayToByteBufferSigned(byte[] array1, int offset1, int length1, + ByteBuffer buf, int offset2, int length2) { if (array1 == null && buf == null) return 0; int min_length = (length1 < length2) ? length1 : length2; for (int i = 0; i < min_length; i++) { - if (array1[i + offset1] < buf.get(i + offset2)) { - return 1; - } - if (array1[i + offset1] > buf.get(i + offset2)) { - return -1; + int value1 = array1[i + offset1]; + int value2 = buf.get(i + offset2); + if (value1 != value2) { + return value2 - value1; } } // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } + return length2 - length1; } - private static final int compareTwoByteBuffers(ByteBuffer buf1, int offset1, int length1, - ByteBuffer buf2, int offset2, int length2) { + private static final int compareTwoByteBuffersSigned(ByteBuffer buf1, int offset1, int length1, + ByteBuffer buf2, int offset2, int length2) { if (buf1 == null && buf2 == null) return 0; int min_length = (length1 < length2) ? length1 : length2; for (int i = 0; i < min_length; i++) { - if (buf1.get(i + offset1) < buf2.get(i + offset2)) { - return 1; - } - if (buf1.get(i + offset1) > buf2.get(i + offset2)) { - return -1; + int value1 = buf1.get(i + offset1); + int value2 = buf2.get(i + offset2); + if (value1 != value2) { + return value2 - value1; } } // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } + return length2 - length1; } - private static final int compareTwoByteArrays(byte[] array1, int offset1, int length1, - byte[] array2, int offset2, int length2) { + private static final int compareTwoByteArraysSigned(byte[] array1, int offset1, int length1, + byte[] array2, int offset2, int length2) { if (array1 == null && array2 == null) return 0; if (array1 == array2 && offset1 == offset2 && length1 == length2) return 0; int min_length = (length1 < length2) ? length1 : length2; for (int i = 0; i < min_length; i++) { - if (array1[i + offset1] < array2[i + offset2]) { - return 1; + int value1 = array1[i + offset1]; + int value2 = array2[i + offset2]; + if (value1 != value2) { + return value2 - value1; } - if (array1[i + offset1] > array2[i + offset2]) { - return -1; + } + // check remainder + return length2 - length1; + } + + public static final int compareTwoBinaryUnsigned(Binary first, Binary second) { + byte[] array1 = first.getBytes(); + byte[] array2 = second.getBytes(); + if ((array1 == null) && (array2 == null)) return 0; + if (array1 == array2) return 0; + int length1 = array1.length; + int length2 = array2.length; + int min_length = (length1 < length2) ? length1 : length2; + for (int i = 0; i < min_length; i++) { + int value1 = array1[i] & 0xFF; + int value2 = array2[i] & 0xFF; + if (value1 != value2) { + return value1 - value2; } } // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } + return length1 - length2; } } diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java index 128acb49f6..e863c34f54 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java @@ -383,6 +383,19 @@ public void testBinaryMinMax() { assertEquals(stats.toString(), "min: a, max: world, num_nulls: 0"); } + @Test + public void testBinaryMinMaxUnsigned() { + stringArray = new String[] {"é", "a", "b", "c"}; + BinaryStatistics stats = new BinaryStatistics(); + + for (String s: stringArray) { + stats.updateStats(Binary.fromString(s)); + } + + assertEquals(stats.genericGetMaxUnsigned(), Binary.fromString("é")); + assertEquals(stats.genericGetMinUnsigned(), Binary.fromString("a")); + } + @Test public void testBinaryMinMaxForReusedBackingByteArray() { BinaryStatistics stats = new BinaryStatistics(); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/compat/RowGroupFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/compat/RowGroupFilter.java index fd74799aaa..2034f96284 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/compat/RowGroupFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/compat/RowGroupFilter.java @@ -71,9 +71,9 @@ private RowGroupFilter(List blocks, MessageType schema) { private RowGroupFilter(List levels, List blocks, ParquetFileReader reader) { this.blocks = checkNotNull(blocks, "blocks"); - this.reader = checkNotNull(reader, "reader"); this.schema = reader.getFileMetaData().getSchema(); this.levels = levels; + this.reader = checkNotNull(reader, "reader"); } @Override diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java index b37297aaaf..d3a8d6e286 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java @@ -23,6 +23,7 @@ import java.util.Map; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.filter2.predicate.ByteSignedness; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.filter2.predicate.FilterPredicate; import org.apache.parquet.filter2.predicate.Operators.And; @@ -40,7 +41,6 @@ import org.apache.parquet.filter2.predicate.UserDefinedPredicate; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; -import static org.apache.parquet.Preconditions.checkArgument; import static org.apache.parquet.Preconditions.checkNotNull; /** @@ -134,7 +134,8 @@ public > Boolean visit(Eq eq) { } // drop if value < min || value > max - return value.compareTo(stats.genericGetMin()) < 0 || value.compareTo(stats.genericGetMax()) > 0; + ByteSignedness signedness = eq.getSignedness(); + return stats.compareValueToMin(value, signedness) < 0 || stats.compareValueToMax(value, signedness) > 0; } @Override @@ -173,7 +174,8 @@ public > Boolean visit(NotEq notEq) { } // drop if this is a column where min = max = value - return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax()) == 0; + ByteSignedness signedness = notEq.getSignedness(); + return stats.compareValueToMin(value, signedness) == 0 && stats.compareValueToMax(value, signedness) == 0; } @Override @@ -204,7 +206,8 @@ public > Boolean visit(Lt lt) { T value = lt.getValue(); // drop if value <= min - return value.compareTo(stats.genericGetMin()) <= 0; + ByteSignedness signedness = lt.getSignedness(); + return stats.compareValueToMin(value, signedness) <= 0; } @Override @@ -235,7 +238,8 @@ public > Boolean visit(LtEq ltEq) { T value = ltEq.getValue(); // drop if value < min - return value.compareTo(stats.genericGetMin()) < 0; + ByteSignedness signedness = ltEq.getSignedness(); + return stats.compareValueToMin(value, signedness) < 0; } @Override @@ -266,7 +270,8 @@ public > Boolean visit(Gt gt) { T value = gt.getValue(); // drop if value >= max - return value.compareTo(stats.genericGetMax()) >= 0; + ByteSignedness signedness = gt.getSignedness(); + return stats.compareValueToMax(value, signedness) >= 0; } @Override @@ -297,7 +302,8 @@ public > Boolean visit(GtEq gtEq) { T value = gtEq.getValue(); // drop if value >= max - return value.compareTo(stats.genericGetMax()) > 0; + ByteSignedness signedness = gtEq.getSignedness(); + return stats.compareValueToMax(value, signedness) > 0; } @Override diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 9eb471f26c..b0778ef4f3 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -38,6 +38,7 @@ import org.apache.parquet.CorruptStatistics; import org.apache.parquet.Log; +import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.format.PageEncodingStats; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.format.ColumnChunk; @@ -295,6 +296,12 @@ public static Statistics toParquetStatistics( stats.setMax(statistics.getMaxBytes()); stats.setMin(statistics.getMinBytes()); } + // In newer versions of the format, we allow columns to distinguish between a signed and unsigned interpretation. + // Add these fields to make them available to the reader. + stats.setUnsigned_max(statistics.getMaxBytesUnsigned()); + stats.setUnsigned_min(statistics.getMinBytesUnsigned()); + stats.setSigned_max(statistics.getMaxBytesSigned()); + stats.setSigned_min(statistics.getMinBytesSigned()); } return stats; } @@ -319,6 +326,18 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist if (statistics.isSetMax() && statistics.isSetMin()) { stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array()); } + // If the signed_min and signed_max are set, we use those, otherwise we fall back to min and max fields. + if (statistics.isSetSigned_max() && statistics.isSetSigned_min()) { + stats.setMinMaxSignedFromBytes(statistics.signed_min.array(), statistics.signed_max.array()); + } else if (statistics.isSetMax() && statistics.isSetMin()) { + stats.setMinMaxSignedFromBytes(statistics.min.array(), statistics.max.array()); + } + // We use unsigned_min and unsigned_max if available, otherwise we once again default to the signed min and max. + if (statistics.isSetUnsigned_max() && statistics.isSetUnsigned_min()) { + stats.setMinMaxUnsignedFromBytes(statistics.unsigned_min.array(), statistics.unsigned_max.array()); + } else if (statistics.isSetMax() && statistics.isSetMin()) { + stats.setMinMaxUnsignedFromBytes(statistics.min.array(), statistics.max.array()); + } stats.setNumNulls(statistics.null_count); } return stats; diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java index b47ed694a8..0db6ddf1c4 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -22,6 +22,7 @@ import java.util.HashSet; import java.util.List; +import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.io.api.Binary; import org.junit.Test; @@ -29,6 +30,7 @@ import org.apache.parquet.column.statistics.DoubleStatistics; import org.apache.parquet.column.statistics.IntStatistics; import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.filter2.predicate.ByteSignedness; import org.apache.parquet.filter2.predicate.FilterPredicate; import org.apache.parquet.filter2.predicate.LogicalInverseRewriter; import org.apache.parquet.filter2.predicate.Operators.BinaryColumn; @@ -80,17 +82,31 @@ private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, l 0L, 0L, valueCount, 0L, 0L); } + private static ColumnChunkMetaData getBinaryColumnMeta(BinaryStatistics stats, long valueCount) { + return ColumnChunkMetaData.get(ColumnPath.get("binary", "column"), + PrimitiveTypeName.BINARY, + CompressionCodecName.GZIP, + new HashSet(Arrays.asList(Encoding.PLAIN)), + stats, + 0L, 0L, valueCount, 0L, 0L); + } + private static final IntColumn intColumn = intColumn("int.column"); private static final DoubleColumn doubleColumn = doubleColumn("double.column"); private static final BinaryColumn missingColumn = binaryColumn("missing"); + private static final BinaryColumn binaryColumn = binaryColumn("binary.column"); private static final IntStatistics intStats = new IntStatistics(); private static final IntStatistics nullIntStats = new IntStatistics(); private static final DoubleStatistics doubleStats = new DoubleStatistics(); + private static final BinaryStatistics binaryStats = new BinaryStatistics(); static { intStats.setMinMax(10, 100); doubleStats.setMinMax(10, 100); + // Statistics should be correct here + binaryStats.updateStats(Binary.fromString("b")); + binaryStats.updateStats(Binary.fromString("é")); nullIntStats.setMinMax(0, 0); nullIntStats.setNumNulls(177); @@ -98,13 +114,13 @@ private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, l private static final List columnMetas = Arrays.asList( getIntColumnMeta(intStats, 177L), - getDoubleColumnMeta(doubleStats, 177L)); + getDoubleColumnMeta(doubleStats, 177L), + getBinaryColumnMeta(binaryStats, 177L)); private static final List nullColumnMetas = Arrays.asList( getIntColumnMeta(nullIntStats, 177L), // column of all nulls getDoubleColumnMeta(doubleStats, 177L)); - @Test public void testEqNonNull() { assertTrue(canDrop(eq(intColumn, 9), columnMetas)); @@ -112,6 +128,11 @@ public void testEqNonNull() { assertFalse(canDrop(eq(intColumn, 100), columnMetas)); assertTrue(canDrop(eq(intColumn, 101), columnMetas)); + // Won't be able to drop "a" due to signed comparison against "é" + assertFalse(canDrop(eq(binaryColumn, fromString("a")), columnMetas)); + // Will drop when bytes are compared unsigned + assertTrue(canDrop(eq(binaryColumn, fromString("a"), ByteSignedness.UNSIGNED), columnMetas)); + // drop columns of all nulls when looking for non-null value assertTrue(canDrop(eq(intColumn, 0), nullColumnMetas)); assertTrue(canDrop(eq(missingColumn, fromString("any")), columnMetas)); @@ -204,6 +225,11 @@ public void testLt() { assertTrue(canDrop(lt(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(lt(missingColumn, fromString("any")), columnMetas)); + + // When comparing signed, "b" cannot be ruled out + assertFalse(canDrop(lt(binaryColumn, fromString("b")), columnMetas)); + // When unsigned it is dropped because "b" = min + assertTrue(canDrop(lt(binaryColumn, fromString("b"), ByteSignedness.UNSIGNED), columnMetas)); } @Test @@ -217,6 +243,11 @@ public void testLtEq() { assertTrue(canDrop(ltEq(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(ltEq(missingColumn, fromString("any")), columnMetas)); + + // When comparing signed, "a" cannot be ruled out because "a" > "é" + assertFalse(canDrop(ltEq(binaryColumn, fromString("a")), columnMetas)); + // When unsigned "a" can be dropped + assertTrue(canDrop(ltEq(binaryColumn, fromString("a"), ByteSignedness.UNSIGNED), columnMetas)); } @Test @@ -230,6 +261,13 @@ public void testGt() { assertTrue(canDrop(gt(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(gt(missingColumn, fromString("any")), columnMetas)); + + // When comparing signed, "binary.column > b" will drop this group + assertTrue(canDrop(gt(binaryColumn, fromString("b")), columnMetas)); + assertFalse(canDrop(gt(binaryColumn, fromString("é")), columnMetas)); + // Flipped when unsigned + assertFalse(canDrop(gt(binaryColumn, fromString("b"), ByteSignedness.UNSIGNED), columnMetas)); + assertTrue(canDrop(gt(binaryColumn, fromString("é"), ByteSignedness.UNSIGNED), columnMetas)); } @Test @@ -243,6 +281,11 @@ public void testGtEq() { assertTrue(canDrop(gtEq(intColumn, 7), nullColumnMetas)); assertTrue(canDrop(gtEq(missingColumn, fromString("any")), columnMetas)); + + // "binary.column >= b" cannot rule out b when signed, can when unsigned + assertFalse(canDrop(gtEq(binaryColumn, fromString("b")), columnMetas)); + assertFalse(canDrop(gtEq(binaryColumn, fromString("é" + 1)), columnMetas)); + assertTrue(canDrop(gtEq(binaryColumn, fromString("é" + 1), ByteSignedness.UNSIGNED), columnMetas)); } @Test diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 3c888c37d9..a825e6d327 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -405,6 +405,40 @@ public void testBinaryStats() { Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY); Assert.assertTrue(roundTripStats.isEmpty()); + + // Test conversion back and forth. + stats = new BinaryStatistics(); + min = new byte[] {-1, 0, 0}; + max = new byte[] {1, 0, 0}; + stats.updateStats(Binary.fromConstantByteArray(min)); + stats.updateStats(Binary.fromConstantByteArray(max)); + formatStats = ParquetMetadataConverter.toParquetStatistics(stats); + Assert.assertTrue(formatStats.isSetSigned_max()); + Assert.assertTrue(formatStats.isSetSigned_min()); + Assert.assertTrue(formatStats.isSetUnsigned_max()); + Assert.assertTrue(formatStats.isSetUnsigned_min()); + stats = (BinaryStatistics) ParquetMetadataConverter + .fromParquetStatistics("test-thing version 1.0", formatStats, PrimitiveTypeName.BINARY); + Assert.assertArrayEquals(min, stats.getMinBytes()); + Assert.assertArrayEquals(max, stats.getMaxBytes()); + Assert.assertArrayEquals(min, stats.getMinBytesSigned()); + Assert.assertArrayEquals(max, stats.getMaxBytesSigned()); + Assert.assertArrayEquals(min, stats.getMaxBytesUnsigned()); + Assert.assertArrayEquals(max, stats.getMinBytesUnsigned()); + + // Backwards compatibility check: min and max should take the place of signed min/max and unsigned min/max + // when those aren't specified. + org.apache.parquet.format.Statistics stats2 = new org.apache.parquet.format.Statistics(); + stats2.setMin(min); + stats2.setMax(max); + BinaryStatistics minMaxOnlyStats = (BinaryStatistics) ParquetMetadataConverter + .fromParquetStatistics("test-thing version 1.0", stats2, PrimitiveTypeName.BINARY); + Assert.assertArrayEquals(min, minMaxOnlyStats.getMinBytes()); + Assert.assertArrayEquals(max, minMaxOnlyStats.getMaxBytes()); + Assert.assertArrayEquals(min, minMaxOnlyStats.getMinBytesSigned()); + Assert.assertArrayEquals(max, minMaxOnlyStats.getMaxBytesSigned()); + Assert.assertArrayEquals(min, minMaxOnlyStats.getMinBytesUnsigned()); + Assert.assertArrayEquals(max, minMaxOnlyStats.getMaxBytesUnsigned()); } @Test diff --git a/pom.xml b/pom.xml index ca3430904a..1760eb6826 100644 --- a/pom.xml +++ b/pom.xml @@ -73,7 +73,7 @@ 1.1.0 2.5.3 3.0.3 - 2.3.1 + 2.3.2-SNAPSHOT 1.7.0 thrift 2.10.4