diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java index 07a5364c64..04b390162b 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Util.java @@ -29,10 +29,6 @@ import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.column.statistics.BooleanStatistics; -import org.apache.parquet.column.statistics.DoubleStatistics; -import org.apache.parquet.column.statistics.FloatStatistics; -import org.apache.parquet.column.statistics.IntStatistics; -import org.apache.parquet.column.statistics.LongStatistics; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.schema.MessageType; @@ -40,7 +36,6 @@ import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; import java.nio.charset.StandardCharsets; -import java.util.Locale; import java.util.Set; import static org.apache.parquet.column.Encoding.BIT_PACKED; @@ -96,34 +91,14 @@ public static String minMaxAsString(Statistics stats, OriginalType annotation) { return ""; } // TODO: use original types when showing decimal, timestamp, etc. - if (stats instanceof BooleanStatistics) { - return String.format("%s / %s", - ((BooleanStatistics) stats).getMin(), - ((BooleanStatistics) stats).getMax()); - } else if (stats instanceof IntStatistics) { - return String.format("%d / %d", - ((IntStatistics) stats).getMin(), - ((IntStatistics) stats).getMax()); - } else if (stats instanceof LongStatistics) { - return String.format("%d / %d", - ((LongStatistics) stats).getMin(), - ((LongStatistics) stats).getMax()); - } else if (stats instanceof FloatStatistics) { - return String.format("%f / %f", - ((FloatStatistics) stats).getMin(), - ((FloatStatistics) stats).getMax()); - } else if (stats instanceof DoubleStatistics) { - return String.format("%f / %f", - ((DoubleStatistics) stats).getMin(), - ((DoubleStatistics) stats).getMax()); - } else if (stats instanceof BinaryStatistics) { + if (stats instanceof BinaryStatistics) { byte[] minBytes = stats.getMinBytes(); byte[] maxBytes = stats.getMaxBytes(); return String.format("%s / %s", printable(minBytes, annotation == OriginalType.UTF8, 30), printable(maxBytes, annotation == OriginalType.UTF8, 30)); } else { - throw new RuntimeException("Unknown stats type: " + stats); + return String.format("%s / %s", stats.minAsString(), stats.maxAsString()); } } @@ -134,24 +109,6 @@ public static String toString(Statistics stats, long count, OriginalType annotat // TODO: use original types when showing decimal, timestamp, etc. 
if (stats instanceof BooleanStatistics) { return String.format("nulls: %d/%d", stats.getNumNulls(), count); - } else if (stats instanceof IntStatistics) { - return String.format("min: %d max: %d nulls: %d/%d", - ((IntStatistics) stats).getMin(), ((IntStatistics) stats).getMax(), - stats.getNumNulls(), count); - } else if (stats instanceof LongStatistics) { - return String.format("min: %d max: %d nulls: %d/%d", - ((LongStatistics) stats).getMin(), ((LongStatistics) stats).getMax(), - stats.getNumNulls(), count); - } else if (stats instanceof FloatStatistics) { - return String.format("min: %f max: %f nulls: %d/%d", - ((FloatStatistics) stats).getMin(), - ((FloatStatistics) stats).getMax(), - stats.getNumNulls(), count); - } else if (stats instanceof DoubleStatistics) { - return String.format("min: %f max: %f nulls: %d/%d", - ((DoubleStatistics) stats).getMin(), - ((DoubleStatistics) stats).getMax(), - stats.getNumNulls(), count); } else if (stats instanceof BinaryStatistics) { byte[] minBytes = stats.getMinBytes(); byte[] maxBytes = stats.getMaxBytes(); @@ -160,7 +117,8 @@ public static String toString(Statistics stats, long count, OriginalType annotat printable(maxBytes, annotation == OriginalType.UTF8, 30), stats.getNumNulls(), count); } else { - throw new RuntimeException("Unknown stats type: " + stats); + return String.format("min: %s max: %s nulls: %d/%d", + stats.minAsString(), stats.maxAsString(), stats.getNumNulls(), count); } } diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java index 8f6082122b..fbeebdfba6 100644 --- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java +++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/CheckParquet251Command.java @@ -53,6 +53,7 @@ import javax.annotation.Nullable; import java.io.IOException; import java.util.Arrays; +import java.util.Comparator; import java.util.List; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; @@ -184,9 +185,11 @@ private class StatsValidator> { private final boolean hasNonNull; private final T min; private final T max; + private final Comparator comparator; public StatsValidator(DataPage page) { Statistics stats = getStatisticsFromPageHeader(page); + this.comparator = stats.comparator(); this.hasNonNull = stats.hasNonNullValue(); if (hasNonNull) { this.min = stats.genericGetMin(); @@ -199,10 +202,10 @@ public StatsValidator(DataPage page) { public void validate(T value) { if (hasNonNull) { - if (min.compareTo(value) > 0) { + if (comparator.compare(min, value) > 0) { throw new BadStatsException("Min should be <= all values."); } - if (max.compareTo(value) < 0) { + if (comparator.compare(max, value) < 0) { throw new BadStatsException("Max should be >= all values."); } } @@ -343,8 +346,8 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, console.debug(String.format( "Validated stats min=%s max=%s nulls=%d for page=%s col=%s", - String.valueOf(stats.genericGetMin()), - String.valueOf(stats.genericGetMax()), stats.getNumNulls(), page, + stats.minAsString(), + stats.maxAsString(), stats.getNumNulls(), page, Arrays.toString(desc.getPath()))); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java b/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java index 61f13a2740..5f30cd0901 100644 --- 
a/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/ColumnDescriptor.java @@ -20,7 +20,9 @@ import java.util.Arrays; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; /** * Describes a column's type as well as its position in its containing schema. @@ -31,8 +33,7 @@ public class ColumnDescriptor implements Comparable<ColumnDescriptor> { private final String[] path; - private final PrimitiveTypeName type; - private final int typeLength; + private final PrimitiveType type; private final int maxRep; private final int maxDef; @@ -42,8 +43,10 @@ public class ColumnDescriptor implements Comparable<ColumnDescriptor> { * @param type the type of the field * @param maxRep the maximum repetition level for that path * @param maxDef the maximum definition level for that path + * @deprecated Use {@link #ColumnDescriptor(String[], PrimitiveType, int, int)} */ - public ColumnDescriptor(String[] path, PrimitiveTypeName type, int maxRep, + @Deprecated + public ColumnDescriptor(String[] path, PrimitiveTypeName type, int maxRep, int maxDef) { this(path, type, 0, maxRep, maxDef); } @@ -54,13 +57,23 @@ public ColumnDescriptor(String[] path, PrimitiveTypeName type, int maxRep, * @param type the type of the field * @param maxRep the maximum repetition level for that path * @param maxDef the maximum definition level for that path + * @deprecated Use {@link #ColumnDescriptor(String[], PrimitiveType, int, int)} */ - public ColumnDescriptor(String[] path, PrimitiveTypeName type, + @Deprecated + public ColumnDescriptor(String[] path, PrimitiveTypeName type, int typeLength, int maxRep, int maxDef) { - super(); + this(path, new PrimitiveType(Type.Repetition.OPTIONAL, type, typeLength, ""), maxRep, maxDef); + } + + /** + * @param path the path to the leaf field in the schema + * @param type the type of the field + * @param maxRep the maximum repetition level for that path + * @param maxDef the maximum definition level for that path + */ + public ColumnDescriptor(String[] path, PrimitiveType type, int maxRep, int maxDef) { this.path = path; this.type = type; - this.typeLength = typeLength; this.maxRep = maxRep; this.maxDef = maxDef; } @@ -88,16 +101,27 @@ public int getMaxDefinitionLevel() { /** * @return the type of that column + * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. */ + @Deprecated public PrimitiveTypeName getType() { - return type; + return type.getPrimitiveTypeName(); } /** * @return the size of the type + * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. 
**/ + @Deprecated public int getTypeLength() { - return typeLength; + return type.getTypeLength(); + } + + /** + * @return the primitive type object of the column + */ + public PrimitiveType getPrimitiveType() { + return type; } @Override diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java index c5b3884194..e274c112b5 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV1.java @@ -80,7 +80,7 @@ private void log(Object value, int r, int d) { } private void resetStatistics() { - this.statistics = Statistics.getStatsBasedOnType(this.path.getType()); + this.statistics = Statistics.createStats(this.path.getPrimitiveType()); } /** diff --git a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java index c6fd91b5eb..b50d663b6c 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/impl/ColumnWriterV2.java @@ -77,7 +77,7 @@ private void log(Object value, int r, int d) { } private void resetStatistics() { - this.statistics = Statistics.getStatsBasedOnType(this.path.getType()); + this.statistics = Statistics.createStats(path.getPrimitiveType()); } private void definitionLevel(int definitionLevel) { diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java index c319b4adb0..a68285bc1c 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java @@ -19,12 +19,38 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; public class BinaryStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named("fake_binary_type"); + private Binary max; private Binary min; + /** + * @deprecated will be removed in 2.0.0. Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated + public BinaryStatistics() { + this(DEFAULT_FAKE_TYPE); + } + + BinaryStatistics(PrimitiveType type) { + super(type); + } + + private BinaryStatistics(BinaryStatistics other) { + super(other.type()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(Binary value) { if (!this.hasNonNullValue()) { @@ -68,18 +94,14 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || ((min.length() + max.length()) < size); + String toString(Binary value) { + // TODO: have separate toString for different logical types? + return value == null ? 
"null" : value.toStringUsingUTF8(); } @Override - public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %s, max: %s, num_nulls: %d", min.toStringUsingUTF8(), max.toStringUsingUTF8(), this.getNumNulls()); - else if (!this.isEmpty()) - return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); - else - return "no stats for this column"; + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || ((min.length() + max.length()) < size); } /** @@ -87,8 +109,8 @@ else if (!this.isEmpty()) */ @Deprecated public void updateStats(Binary min_value, Binary max_value) { - if (min.compareTo(min_value) > 0) { min = min_value.copy(); } - if (max.compareTo(max_value) < 0) { max = max_value.copy(); } + if (comparator().compare(min, min_value) > 0) { min = min_value.copy(); } + if (comparator().compare(max, max_value) < 0) { max = max_value.copy(); } } /** @@ -136,4 +158,9 @@ public void setMinMax(Binary min, Binary max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public BinaryStatistics copy() { + return new BinaryStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java index 22c23933bd..0e77b61e1b 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BooleanStatistics.java @@ -19,12 +19,38 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; public class BooleanStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN) + .named("fake_boolean_type"); + private boolean max; private boolean min; + /** + * @deprecated will be removed in 2.0.0. 
Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated + public BooleanStatistics() { + this(DEFAULT_FAKE_TYPE); + } + + BooleanStatistics(PrimitiveType type) { + super(type); + } + + private BooleanStatistics(BooleanStatistics other) { + super(other.type()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(boolean value) { if (!this.hasNonNullValue()) { @@ -66,19 +92,9 @@ public boolean isSmallerThan(long size) { return !hasNonNullValue() || (2 < size); } - @Override - public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %b, max: %b, num_nulls: %d", min, max, this.getNumNulls()); - else if(!this.isEmpty()) - return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); - else - return "no stats for this column"; - } - public void updateStats(boolean min_value, boolean max_value) { - if (min && !min_value) { min = min_value; } - if (!max && max_value) { max = max_value; } + if (comparator().compare(min, min_value) > 0) { min = min_value; } + if (comparator().compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(boolean min_value, boolean max_value) { @@ -97,6 +113,14 @@ public Boolean genericGetMax() { return max; } + public int compareMinToValue(boolean value) { + return comparator().compare(min, value); + } + + public int compareMaxToValue(boolean value) { + return comparator().compare(max, value); + } + public boolean getMax() { return max; } @@ -110,4 +134,9 @@ public void setMinMax(boolean min, boolean max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public BooleanStatistics copy() { + return new BooleanStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java index d67a550a6f..0dd067b717 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/DoubleStatistics.java @@ -19,12 +19,38 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; public class DoubleStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE) + .named("fake_double_type"); + private double max; private double min; + /** + * @deprecated will be removed in 2.0.0. 
Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated + public DoubleStatistics() { + this(DEFAULT_FAKE_TYPE); + } + + DoubleStatistics(PrimitiveType type) { + super(type); + } + + private DoubleStatistics(DoubleStatistics other) { + super(other.type()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(double value) { if (!this.hasNonNullValue()) { @@ -62,23 +88,18 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || (16 < size); + String toString(Double value) { + return String.format("%.5f", value); } @Override - public String toString() { - if(this.hasNonNullValue()) - return String.format("min: %.5f, max: %.5f, num_nulls: %d", min, max, this.getNumNulls()); - else if (!this.isEmpty()) - return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); - else - return "no stats for this column"; + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (16 < size); } public void updateStats(double min_value, double max_value) { - if (min_value < min) { min = min_value; } - if (max_value > max) { max = max_value; } + if (comparator().compare(min, min_value) > 0) { min = min_value; } + if (comparator().compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(double min_value, double max_value) { @@ -97,6 +118,14 @@ public Double genericGetMax() { return max; } + public int compareMinToValue(double value) { + return comparator().compare(min, value); + } + + public int compareMaxToValue(double value) { + return comparator().compare(max, value); + } + public double getMax() { return max; } @@ -110,4 +139,9 @@ public void setMinMax(double min, double max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public DoubleStatistics copy() { + return new DoubleStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java index dffc2077ed..36836c6ff7 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/FloatStatistics.java @@ -19,12 +19,39 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; public class FloatStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.FLOAT) + .named("fake_float_type"); + private float max; private float min; + /** + * @deprecated will be removed in 2.0.0. 
Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated + public FloatStatistics() { + // Creating a fake primitive type to have the proper comparator + this(DEFAULT_FAKE_TYPE); + } + + FloatStatistics(PrimitiveType type) { + super(type); + } + + private FloatStatistics(FloatStatistics other) { + super(other.type()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(float value) { if (!this.hasNonNullValue()) { @@ -62,23 +89,18 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || (8 < size); + String toString(Float value) { + return String.format("%.5f", value); } @Override - public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %.5f, max: %.5f, num_nulls: %d", min, max, this.getNumNulls()); - else if (!this.isEmpty()) - return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); - else - return "no stats for this column"; + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (8 < size); } public void updateStats(float min_value, float max_value) { - if (min_value < min) { min = min_value; } - if (max_value > max) { max = max_value; } + if (comparator().compare(min, min_value) > 0) { min = min_value; } + if (comparator().compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(float min_value, float max_value) { @@ -97,6 +119,14 @@ public Float genericGetMax() { return max; } + public int compareMinToValue(float value) { + return comparator().compare(min, value); + } + + public int compareMaxToValue(float value) { + return comparator().compare(max, value); + } + public float getMax() { return max; } @@ -110,4 +140,9 @@ public void setMinMax(float min, float max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public FloatStatistics copy() { + return new FloatStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java index a5d7ba196e..5df7f0a7c6 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/IntStatistics.java @@ -19,12 +19,38 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; public class IntStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .named("fake_int32_type"); + private int max; private int min; + /** + * @deprecated will be removed in 2.0.0. 
Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated + public IntStatistics() { + this(DEFAULT_FAKE_TYPE); + } + + IntStatistics(PrimitiveType type) { + super(type); + } + + private IntStatistics(IntStatistics other) { + super(other.type()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(int value) { if (!this.hasNonNullValue()) { @@ -62,23 +88,19 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || (8 < size); + String toString(Integer value) { + // TODO: implement unsigned int as required + return value.toString(); } @Override - public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %d, max: %d, num_nulls: %d", min, max, this.getNumNulls()); - else if (!this.isEmpty()) - return String.format("num_nulls: %d, min/max is not defined", this.getNumNulls()); - else - return "no stats for this column"; + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (8 < size); } public void updateStats(int min_value, int max_value) { - if (min_value < min) { min = min_value; } - if (max_value > max) { max = max_value; } + if (comparator().compare(min, min_value) > 0) { min = min_value; } + if (comparator().compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(int min_value, int max_value) { @@ -97,6 +119,14 @@ public Integer genericGetMax() { return max; } + public int compareMinToValue(int value) { + return comparator().compare(min, value); + } + + public int compareMaxToValue(int value) { + return comparator().compare(max, value); + } + public int getMax() { return max; } @@ -110,4 +140,9 @@ public void setMinMax(int min, int max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public IntStatistics copy() { + return new IntStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java index f7971efdd8..fd6d19cfda 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/LongStatistics.java @@ -19,12 +19,38 @@ package org.apache.parquet.column.statistics; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; public class LongStatistics extends Statistics { + // A fake type object to be used to generate the proper comparator + private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .named("fake_int64_type"); + private long max; private long min; + /** + * @deprecated will be removed in 2.0.0. 
Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead + */ + @Deprecated + public LongStatistics() { + this(DEFAULT_FAKE_TYPE); + } + + LongStatistics(PrimitiveType type) { + super(type); + } + + private LongStatistics(LongStatistics other) { + super(other.type()); + if (other.hasNonNullValue()) { + initializeStats(other.min, other.max); + } + setNumNulls(other.getNumNulls()); + } + @Override public void updateStats(long value) { if (!this.hasNonNullValue()) { @@ -62,23 +88,19 @@ public byte[] getMinBytes() { } @Override - public boolean isSmallerThan(long size) { - return !hasNonNullValue() || (16 < size); + String toString(Long value) { + // TODO: implement unsigned int as required + return value.toString(); } @Override - public String toString() { - if (this.hasNonNullValue()) - return String.format("min: %d, max: %d, num_nulls: %d", min, max, this.getNumNulls()); - else if (!this.isEmpty()) - return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); - else - return "no stats for this column"; + public boolean isSmallerThan(long size) { + return !hasNonNullValue() || (16 < size); } public void updateStats(long min_value, long max_value) { - if (min_value < min) { min = min_value; } - if (max_value > max) { max = max_value; } + if (comparator().compare(min, min_value) > 0) { min = min_value; } + if (comparator().compare(max, max_value) < 0) { max = max_value; } } public void initializeStats(long min_value, long max_value) { @@ -97,6 +119,14 @@ public Long genericGetMax() { return max; } + public int compareMinToValue(long value) { + return comparator().compare(min, value); + } + + public int compareMaxToValue(long value) { + return comparator().compare(max, value); + } + public long getMax() { return max; } @@ -110,4 +140,9 @@ public void setMinMax(long min, long max) { this.min = min; this.markAsNotEmpty(); } + + @Override + public LongStatistics copy() { + return new LongStatistics(this); + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java index 30153c0743..6eb23819ef 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java @@ -18,10 +18,15 @@ */ package org.apache.parquet.column.statistics; +import java.util.Arrays; +import java.util.Objects; + import org.apache.parquet.column.UnknownColumnTypeException; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveComparator; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; -import java.util.Arrays; +import org.apache.parquet.schema.Type; /** @@ -31,10 +36,14 @@ */ public abstract class Statistics<T extends Comparable<T>> { + private final PrimitiveType type; + private final PrimitiveComparator<T> comparator; private boolean hasNonNullValue; private long num_nulls; - public Statistics() { + Statistics(PrimitiveType type) { + this.type = type; + this.comparator = type.comparator(); hasNonNullValue = false; num_nulls = 0; } @@ -43,27 +52,59 @@ public Statistics() { * Returns the typed statistics object based on the passed type parameter * @param type PrimitiveTypeName type of the column * @return instance of a typed statistics class + * @deprecated Use {@link #createStats(Type)} instead */ + @Deprecated public static Statistics getStatsBasedOnType(PrimitiveTypeName type) { - 
switch(type) { - case INT32: - return new IntStatistics(); - case INT64: - return new LongStatistics(); - case FLOAT: - return new FloatStatistics(); - case DOUBLE: - return new DoubleStatistics(); - case BOOLEAN: - return new BooleanStatistics(); - case BINARY: - return new BinaryStatistics(); - case INT96: - return new BinaryStatistics(); - case FIXED_LEN_BYTE_ARRAY: - return new BinaryStatistics(); - default: - throw new UnknownColumnTypeException(type); + switch (type) { + case INT32: + return new IntStatistics(); + case INT64: + return new LongStatistics(); + case FLOAT: + return new FloatStatistics(); + case DOUBLE: + return new DoubleStatistics(); + case BOOLEAN: + return new BooleanStatistics(); + case BINARY: + return new BinaryStatistics(); + case INT96: + return new BinaryStatistics(); + case FIXED_LEN_BYTE_ARRAY: + return new BinaryStatistics(); + default: + throw new UnknownColumnTypeException(type); + } + } + + /** + * Creates an empty {@code Statistics} instance for the specified type to be + * used for reading/writing the new min/max statistics used in the V2 format. + * + * @param type + * type of the column + * @return instance of a typed statistics class + */ + public static Statistics createStats(Type type) { + PrimitiveType primitive = type.asPrimitiveType(); + switch (primitive.getPrimitiveTypeName()) { + case INT32: + return new IntStatistics(primitive); + case INT64: + return new LongStatistics(primitive); + case FLOAT: + return new FloatStatistics(primitive); + case DOUBLE: + return new DoubleStatistics(primitive); + case BOOLEAN: + return new BooleanStatistics(primitive); + case BINARY: + case INT96: + case FIXED_LEN_BYTE_ARRAY: + return new BinaryStatistics(primitive); + default: + throw new UnknownColumnTypeException(primitive.getPrimitiveTypeName()); } } @@ -127,9 +168,10 @@ public boolean equals(Object other) { if (!(other instanceof Statistics)) return false; Statistics stats = (Statistics) other; - return Arrays.equals(stats.getMaxBytes(), this.getMaxBytes()) && - Arrays.equals(stats.getMinBytes(), this.getMinBytes()) && - stats.getNumNulls() == this.getNumNulls(); + return type.equals(stats.type) && + Arrays.equals(stats.getMaxBytes(), this.getMaxBytes()) && + Arrays.equals(stats.getMinBytes(), this.getMinBytes()) && + stats.getNumNulls() == this.getNumNulls(); } /** @@ -138,7 +180,8 @@ public boolean equals(Object other) { */ @Override public int hashCode() { - return 31 * Arrays.hashCode(getMaxBytes()) + 17 * Arrays.hashCode(getMinBytes()) + Long.valueOf(this.getNumNulls()).hashCode(); + return 31 * type.hashCode() + 31 * Arrays.hashCode(getMaxBytes()) + 17 * Arrays.hashCode(getMinBytes()) + + Long.valueOf(this.getNumNulls()).hashCode(); } /** @@ -150,14 +193,15 @@ public int hashCode() { public void mergeStatistics(Statistics stats) { if (stats.isEmpty()) return; - if (this.getClass() == stats.getClass()) { + // Merge stats only if they have the same type + if (type.equals(stats.type)) { incrementNumNulls(stats.getNumNulls()); if (stats.hasNonNullValue()) { mergeStatisticsMinMax(stats); markAsNotEmpty(); } } else { - throw new StatisticsClassException(this.getClass().toString(), stats.getClass().toString()); + throw StatisticsClassException.create(this, stats); } } @@ -175,9 +219,58 @@ public void mergeStatistics(Statistics stats) { */ abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes); + /** + * Returns the min value in the statistics. 
The java natural order of the returned type defined by {@link + * T#compareTo(Object)} might not be the proper one. For example, UINT_32 requires unsigned comparison instead of the + * natural signed one. Use {@link #compareMinToValue(Comparable)} or the comparator returned by {@link #comparator()} to + * always get the proper ordering. + */ abstract public T genericGetMin(); + + /** + * Returns the max value in the statistics. The java natural order of the returned type defined by {@link + * T#compareTo(Object)} might not be the proper one. For example, UINT_32 requires unsigned comparison instead of the + * natural signed one. Use {@link #compareMaxToValue(Comparable)} or the comparator returned by {@link #comparator()} to + * always get the proper ordering. + */ abstract public T genericGetMax(); + /** + * Returns the {@link PrimitiveComparator} implementation to be used to compare two generic values in the proper way + * (for example, unsigned comparison for UINT_32). + */ + public final PrimitiveComparator comparator() { + return comparator; + } + + /** + * Compares min to the specified value in the proper way. It does the same as invoking + * {@code comparator().compare(genericGetMin(), value)}. The corresponding statistics implementations overload this + * method so the one with the primitive argument shall be used to avoid boxing/unboxing. + * + * @param value + * the value which {@code min} is to be compared to + * @return a negative integer, zero, or a positive integer as {@code min} is less than, equal to, or greater than + * {@code value}. + */ + public final int compareMinToValue(T value) { + return comparator.compare(genericGetMin(), value); + } + + /** + * Compares max to the specified value in the proper way. It does the same as invoking + * {@code comparator().compare(genericGetMax(), value)}. The corresponding statistics implementations overload this + * method so the one with the primitive argument shall be used to avoid boxing/unboxing. + * + * @param value + * the value which {@code max} is to be compared to + * @return a negative integer, zero, or a positive integer as {@code max} is less than, equal to, or greater than + * {@code value}. + */ + public final int compareMaxToValue(T value) { + return comparator.compare(genericGetMax(), value); + } + /** * Abstract method to return the max value as a byte array * @return byte array corresponding to the max value @@ -190,6 +283,24 @@ public void mergeStatistics(Statistics stats) { */ abstract public byte[] getMinBytes(); + /** + * Returns the string representation of min for debugging/logging purposes. + */ + public String minAsString() { + return toString(genericGetMin()); + } + + /** + * Returns the string representation of max for debugging/logging purposes. + */ + public String maxAsString() { + return toString(genericGetMax()); + } + + String toString(T value) { + return Objects.toString(value); + } + /** * Abstract method to return whether the min and max values fit in the given * size. 
@@ -198,11 +309,15 @@ public void mergeStatistics(Statistics stats) { */ abstract public boolean isSmallerThan(long size); - /** - * toString() to display min, max, num_nulls in a string - */ - abstract public String toString(); - + @Override + public String toString() { + if (this.hasNonNullValue()) + return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls()); + else if (!this.isEmpty()) + return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); + else + return "no stats for this column"; + } /** * Increments the null count by one @@ -250,13 +365,25 @@ public boolean isEmpty() { public boolean hasNonNullValue() { return hasNonNullValue; } - + /** * Sets the page/column as having a valid non-null value * kind of misnomer here - */ + */ protected void markAsNotEmpty() { hasNonNullValue = true; } + + /** + * @return a new independent statistics instance of this class. + */ + public abstract Statistics copy(); + + /** + * @return the primitive type object which this statistics is created for + */ + public PrimitiveType type() { + return type; + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/StatisticsClassException.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/StatisticsClassException.java index a242737616..4c23101074 100644 --- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/StatisticsClassException.java +++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/StatisticsClassException.java @@ -29,6 +29,18 @@ public class StatisticsClassException extends ParquetRuntimeException { private static final long serialVersionUID = 1L; public StatisticsClassException(String className1, String className2) { - super("Statistics classes mismatched: " + className1 + " vs. " + className2); + this("Statistics classes mismatched: " + className1 + " vs. " + className2); + } + + private StatisticsClassException(String msg) { + super(msg); + } + + static StatisticsClassException create(Statistics stats1, Statistics stats2) { + if (stats1.getClass() != stats2.getClass()) { + return new StatisticsClassException(stats1.getClass().toString(), stats2.getClass().toString()); + } + return new StatisticsClassException( + "Statistics comparator mismatched: " + stats1.comparator() + " vs. " + stats2.comparator()); } } diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java index 22e4027e3c..8df0250638 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/predicate/Statistics.java @@ -18,6 +18,8 @@ */ package org.apache.parquet.filter2.predicate; +import java.util.Comparator; + import static org.apache.parquet.Preconditions.checkNotNull; /** @@ -26,17 +28,51 @@ public class Statistics { private final T min; private final T max; + private final Comparator comparator; + // Intended for use only within Parquet itself. + /** + * @deprecated will be removed in 2.0.0. Use {@link #Statistics(Object, Object, Comparator)} instead + */ + @Deprecated public Statistics(T min, T max) { this.min = checkNotNull(min, "min"); this.max = checkNotNull(max, "max"); + this.comparator = null; } + // Intended for use only within Parquet itself. 
+ public Statistics(T min, T max, Comparator comparator) { + this.min = checkNotNull(min, "min"); + this.max = checkNotNull(max, "max"); + this.comparator = checkNotNull(comparator, "comparator"); + } + + /** + * Returns the generic object representing the min value in the statistics. The + * natural ordering of type {@code T} defined by the {@code compareTo} method + * might not be appropriate for the actual logical type. Use + * {@link #getComparator()} for comparing. + */ public T getMin() { return min; } + /** + * Returns the generic object representing the max value in the statistics. The + * natural ordering of type {@code T} defined by the {@code compareTo} method + * might not be appropriate for the actual logical type. Use + * {@link #getComparator()} for comparing. + */ public T getMax() { return max; } + + /** + * Returns the comparator to be used to compare two generic values in the proper way (e.g. unsigned comparison for + * UINT_32) + */ + public Comparator getComparator() { + return comparator; + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java b/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java index 8def88eec4..c1f759c377 100644 --- a/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java +++ b/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicateBuilderBase.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Map; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.filter2.predicate.FilterPredicate; import org.apache.parquet.filter2.predicate.FilterPredicate.Visitor; @@ -30,6 +31,8 @@ import org.apache.parquet.filter2.predicate.Operators.Not; import org.apache.parquet.filter2.predicate.Operators.Or; import org.apache.parquet.filter2.recordlevel.IncrementallyUpdatedFilterPredicate.ValueInspector; +import org.apache.parquet.io.PrimitiveColumnIO; +import org.apache.parquet.schema.PrimitiveComparator; import static org.apache.parquet.Preconditions.checkArgument; @@ -55,9 +58,20 @@ public abstract class IncrementallyUpdatedFilterPredicateBuilderBase implements Visitor { private boolean built = false; private final Map> valueInspectorsByColumn = new HashMap>(); + private final Map> comparatorsByColumn = new HashMap<>(); + @Deprecated public IncrementallyUpdatedFilterPredicateBuilderBase() { } + public IncrementallyUpdatedFilterPredicateBuilderBase(List leaves) { + for (PrimitiveColumnIO leaf : leaves) { + ColumnDescriptor descriptor = leaf.getColumnDescriptor(); + ColumnPath path = ColumnPath.get(descriptor.getPath()); + PrimitiveComparator comparator = descriptor.getPrimitiveType().comparator(); + comparatorsByColumn.put(path, comparator); + } + } + public final IncrementallyUpdatedFilterPredicate build(FilterPredicate pred) { checkArgument(!built, "This builder has already been used"); IncrementallyUpdatedFilterPredicate incremental = pred.accept(this); @@ -78,6 +92,11 @@ public Map> getValueInspectorsByColumn() { return valueInspectorsByColumn; } + @SuppressWarnings("unchecked") + protected final PrimitiveComparator getComparator(ColumnPath path) { + return (PrimitiveComparator) comparatorsByColumn.get(path); + } + @Override public final IncrementallyUpdatedFilterPredicate visit(And and) { return new 
IncrementallyUpdatedFilterPredicate.And(and.getLeft().accept(this), and.getRight().accept(this)); diff --git a/parquet-column/src/main/java/org/apache/parquet/io/MessageColumnIO.java b/parquet-column/src/main/java/org/apache/parquet/io/MessageColumnIO.java index 67efdb3a37..7346c5a35f 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/MessageColumnIO.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/MessageColumnIO.java @@ -109,7 +109,7 @@ public RecordReader getRecordReader(final PageReadStore columns, public RecordReader visit(FilterPredicateCompat filterPredicateCompat) { FilterPredicate predicate = filterPredicateCompat.getFilterPredicate(); - IncrementallyUpdatedFilterPredicateBuilder builder = new IncrementallyUpdatedFilterPredicateBuilder(); + IncrementallyUpdatedFilterPredicateBuilder builder = new IncrementallyUpdatedFilterPredicateBuilder(leaves); IncrementallyUpdatedFilterPredicate streamingPredicate = builder.build(predicate); RecordMaterializer filteringRecordMaterializer = new FilteringRecordMaterializer( recordMaterializer, diff --git a/parquet-column/src/main/java/org/apache/parquet/io/PrimitiveColumnIO.java b/parquet-column/src/main/java/org/apache/parquet/io/PrimitiveColumnIO.java index 15c28c8cc9..e40b24f133 100644 --- a/parquet-column/src/main/java/org/apache/parquet/io/PrimitiveColumnIO.java +++ b/parquet-column/src/main/java/org/apache/parquet/io/PrimitiveColumnIO.java @@ -52,10 +52,9 @@ void setLevels(int r, int d, String[] fieldPath, int[] fieldIndexPath, List buf.get(i + offset2)) { - return -1; - } - } - // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } - } - - private static final int compareTwoByteBuffers(ByteBuffer buf1, int offset1, int length1, - ByteBuffer buf2, int offset2, int length2) { - if (buf1 == null && buf2 == null) return 0; - int min_length = (length1 < length2) ? length1 : length2; - for (int i = 0; i < min_length; i++) { - if (buf1.get(i + offset1) < buf2.get(i + offset2)) { - return 1; - } - if (buf1.get(i + offset1) > buf2.get(i + offset2)) { - return -1; - } - } - // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } - } - - private static final int compareTwoByteArrays(byte[] array1, int offset1, int length1, - byte[] array2, int offset2, int length2) { - if (array1 == null && array2 == null) return 0; - if (array1 == array2 && offset1 == offset2 && length1 == length2) return 0; - int min_length = (length1 < length2) ? length1 : length2; - for (int i = 0; i < min_length; i++) { - if (array1[i + offset1] < array2[i + offset2]) { - return 1; - } - if (array1[i + offset1] > array2[i + offset2]) { - return -1; - } - } - // check remainder - if (length1 == length2) { return 0; } - else if (length1 < length2) { return 1;} - else { return -1; } - } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java b/parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java new file mode 100644 index 0000000000..144a93a06a --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/schema/ColumnOrder.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.schema; + +import org.apache.parquet.Preconditions; + +/** + * Class representing the column order with all the related parameters. + */ +public class ColumnOrder { + /** + * The enum type of the column order. + */ + public enum ColumnOrderName { + /** + * Representing the case when the defined column order is undefined (e.g. the file is written by a later API and the + * current one does not support the related column order). No statistics will be written/read in this case. + */ + UNDEFINED, + /** + * Type defined order meaning that the comparison order of the elements are based on its type. + */ + TYPE_DEFINED_ORDER + } + + private static final ColumnOrder UNDEFINED_COLUMN_ORDER = new ColumnOrder(ColumnOrderName.UNDEFINED); + private static final ColumnOrder TYPE_DEFINED_COLUMN_ORDER = new ColumnOrder(ColumnOrderName.TYPE_DEFINED_ORDER); + + /** + * @return a {@link ColumnOrder} instance representing an undefined order + * @see ColumnOrderName#UNDEFINED + */ + public static ColumnOrder undefined() { + return UNDEFINED_COLUMN_ORDER; + } + + /** + * @return a {@link ColumnOrder} instance representing a type defined order + * @see ColumnOrderName#TYPE_DEFINED_ORDER + */ + public static ColumnOrder typeDefined() { + return TYPE_DEFINED_COLUMN_ORDER; + } + + private final ColumnOrderName columnOrderName; + + private ColumnOrder(ColumnOrderName columnOrderName) { + this.columnOrderName = Preconditions.checkNotNull(columnOrderName, "columnOrderName"); + } + + public ColumnOrderName getColumnOrderName() { + return columnOrderName; + } + + /** + * {@inheritDoc} + */ + @Override + public boolean equals(Object obj) { + if (obj instanceof ColumnOrder) { + return columnOrderName == ((ColumnOrder) obj).columnOrderName; + } + return false; + } + + /** + * {@inheritDoc} + */ + @Override + public int hashCode() { + return columnOrderName.hashCode(); + } + + /** + * {@inheritDoc} + */ + @Override + public String toString() { + return columnOrderName.toString(); + } +} diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java b/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java index 1e26ed2425..afbc416f67 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/MessageType.java @@ -95,8 +95,7 @@ public ColumnDescriptor getColumnDescription(String[] path) { int maxRep = getMaxRepetitionLevel(path); int maxDef = getMaxDefinitionLevel(path); PrimitiveType type = getType(path).asPrimitiveType(); - return new ColumnDescriptor(path, type.getPrimitiveTypeName(), - type.getTypeLength(), maxRep, maxDef); + return new ColumnDescriptor(path, type, maxRep, maxDef); } public List getPaths() { @@ -111,8 +110,7 @@ public List getColumns() { PrimitiveType primitiveType = getType(path).asPrimitiveType(); columns.add(new ColumnDescriptor( path, - 
primitiveType.getPrimitiveTypeName(), - primitiveType.getTypeLength(), + primitiveType, getMaxRepetitionLevel(path), getMaxDefinitionLevel(path))); } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java new file mode 100644 index 0000000000..085a67a26d --- /dev/null +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java @@ -0,0 +1,290 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.schema; + +import org.apache.parquet.io.api.Binary; + +import java.nio.ByteBuffer; +import java.util.Comparator; + +/** + * {@link Comparator} implementation that also supports the comparison of the related primitive type to avoid the + * performance penalty of boxing/unboxing. The {@code compare} methods for the not supported primitive types throw + * {@link UnsupportedOperationException}. + */ +public abstract class PrimitiveComparator implements Comparator { + + public int compare(boolean b1, boolean b2) { + throw new UnsupportedOperationException( + "compare(boolean, boolean) was called on a non-boolean comparator: " + toString()); + } + + public int compare(int i1, int i2) { + throw new UnsupportedOperationException("compare(int, int) was called on a non-int comparator: " + toString()); + } + + public int compare(long l1, long l2) { + throw new UnsupportedOperationException("compare(long, long) was called on a non-long comparator: " + toString()); + } + + public int compare(float f1, float f2) { + throw new UnsupportedOperationException( + "compare(float, float) was called on a non-float comparator: " + toString()); + } + + public int compare(double d1, double d2) { + throw new UnsupportedOperationException( + "compare(double, double) was called on a non-double comparator: " + toString()); + } + + @Override + public final int compare(T o1, T o2) { + if (o1 == null) { + return o2 == null ? 0 : -1; + } + return o2 == null ? 
1 : compareNotNulls(o1, o2); + } + + abstract int compareNotNulls(T o1, T o2); + + static final PrimitiveComparator BOOLEAN_COMPARATOR = new PrimitiveComparator() { + @Override + int compareNotNulls(Boolean o1, Boolean o2) { + return compare(o1.booleanValue(), o2.booleanValue()); + } + + @Override + public int compare(boolean b1, boolean b2) { + return Boolean.compare(b1, b2); + } + + @Override + public String toString() { + return "BOOLEAN_COMPARATOR"; + } + }; + + private static abstract class IntComparator extends PrimitiveComparator { + @Override + int compareNotNulls(Integer o1, Integer o2) { + return compare(o1.intValue(), o2.intValue()); + } + } + + static final PrimitiveComparator SIGNED_INT32_COMPARATOR = new IntComparator() { + @Override + public int compare(int i1, int i2) { + return Integer.compare(i1, i2); + } + + @Override + public String toString() { + return "SIGNED_INT32_COMPARATOR"; + } + }; + + static final PrimitiveComparator UNSIGNED_INT32_COMPARATOR = new IntComparator() { + @Override + public int compare(int i1, int i2) { + // Implemented based on com.google.common.primitives.UnsignedInts.compare(int, int) + return Integer.compare(i1 ^ Integer.MIN_VALUE, i2 ^ Integer.MIN_VALUE); + } + + @Override + public String toString() { + return "UNSIGNED_INT32_COMPARATOR"; + } + }; + + private static abstract class LongComparator extends PrimitiveComparator { + @Override + int compareNotNulls(Long o1, Long o2) { + return compare(o1.longValue(), o2.longValue()); + } + } + + static final PrimitiveComparator SIGNED_INT64_COMPARATOR = new LongComparator() { + @Override + public int compare(long l1, long l2) { + return Long.compare(l1, l2); + } + + @Override + public String toString() { + return "SIGNED_INT64_COMPARATOR"; + } + }; + + static final PrimitiveComparator UNSIGNED_INT64_COMPARATOR = new LongComparator() { + @Override + public int compare(long l1, long l2) { + // Implemented based on com.google.common.primitives.UnsignedLongs.compare(long, long) + return Long.compare(l1 ^ Long.MIN_VALUE, l2 ^ Long.MIN_VALUE); + } + + @Override + public String toString() { + return "UNSIGNED_INT64_COMPARATOR"; + } + }; + + static final PrimitiveComparator FLOAT_COMPARATOR = new PrimitiveComparator() { + @Override + int compareNotNulls(Float o1, Float o2) { + return compare(o1.floatValue(), o2.floatValue()); + } + + @Override + public int compare(float f1, float f2) { + return Float.compare(f1, f2); + } + + @Override + public String toString() { + return "FLOAT_COMPARATOR"; + } + }; + + static final PrimitiveComparator DOUBLE_COMPARATOR = new PrimitiveComparator() { + @Override + int compareNotNulls(Double o1, Double o2) { + return compare(o1.doubleValue(), o2.doubleValue()); + } + + @Override + public int compare(double d1, double d2) { + return Double.compare(d1, d2); + } + + @Override + public String toString() { + return "DOUBLE_COMPARATOR"; + } + }; + + private static abstract class BinaryComparator extends PrimitiveComparator { + @Override + int compareNotNulls(Binary o1, Binary o2) { + return compare(o1.toByteBuffer(), o2.toByteBuffer()); + } + + abstract int compare(ByteBuffer b1, ByteBuffer b2); + + final int toUnsigned(byte b) { + return b & 0xFF; + } + } + + public static final PrimitiveComparator UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR = new BinaryComparator() { + @Override + int compare(ByteBuffer b1, ByteBuffer b2) { + int l1 = b1.remaining(); + int l2 = b2.remaining(); + int p1 = b1.position(); + int p2 = b2.position(); + int minL = Math.min(l1, l2); + + for (int i = 0; i 
< minL; ++i) { + int result = unsignedCompare(b1.get(p1 + i), b2.get(p2 + i)); + if (result != 0) { + return result; + } + } + + return l1 - l2; + } + + private int unsignedCompare(byte b1, byte b2) { + return toUnsigned(b1) - toUnsigned(b2); + } + + @Override + public String toString() { + return "UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR"; + } + }; + + /* + * This comparator is for comparing two signed decimal values represented in twos-complement binary. In case of the + * binary length of one value is shorter than the other it will be padded by the corresponding prefix (0xFF for + * negative, 0x00 for positive values). + */ + static final PrimitiveComparator BINARY_AS_SIGNED_INTEGER_COMPARATOR = new BinaryComparator() { + private static final int NEGATIVE_PADDING = 0xFF; + private static final int POSITIVE_PADDING = 0; + + @Override + int compare(ByteBuffer b1, ByteBuffer b2) { + int l1 = b1.remaining(); + int l2 = b2.remaining(); + int p1 = b1.position(); + int p2 = b2.position(); + + boolean isNegative1 = l1 > 0 ? b1.get(p1) < 0 : false; + boolean isNegative2 = l2 > 0 ? b2.get(p2) < 0 : false; + if (isNegative1 != isNegative2) { + return isNegative1 ? -1 : 1; + } + + int result = 0; + + // Compare the beginning of the longer buffer with the proper padding + if (l1 < l2) { + int lengthDiff = l2 - l1; + result = -compareWithPadding(lengthDiff, b2, p2, isNegative1 ? NEGATIVE_PADDING : POSITIVE_PADDING); + p2 += lengthDiff; + } else if (l1 > l2) { + int lengthDiff = l1 - l2; + result = compareWithPadding(lengthDiff, b1, p1, isNegative2 ? NEGATIVE_PADDING : POSITIVE_PADDING); + p1 += lengthDiff; + } + + // The beginning of the longer buffer equals to the padding or the lengths are equal + if (result == 0) { + result = compare(l1, b1, p1, b2, p2); + } + return result; + } + + private int compareWithPadding(int length, ByteBuffer b, int p, int paddingByte) { + for (int i = p, n = p + length; i < n; ++i) { + int result = toUnsigned(b.get(i)) - paddingByte; + if (result != 0) { + return result; + } + } + return 0; + } + + private int compare(int length, ByteBuffer b1, int p1, ByteBuffer b2, int p2) { + for (int i = 0; i < length; ++i) { + int result = toUnsigned(b1.get(p1 + i)) - toUnsigned(b2.get(p2 + i)); + if (result != 0) { + return result; + } + } + return 0; + } + + @Override + public String toString() { + return "BINARY_AS_SIGNED_INTEGER_COMPARATOR"; + } + }; +} diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java index 8056188d25..2d7491f610 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveType.java @@ -22,11 +22,14 @@ import java.util.List; import java.util.Locale; +import org.apache.parquet.Preconditions; +import org.apache.parquet.ShouldNeverHappenException; import org.apache.parquet.column.ColumnReader; import org.apache.parquet.io.InvalidRecordException; import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.io.api.RecordConsumer; +import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; /** @@ -86,6 +89,26 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertINT64(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + if (logicalType == null) { + return 
PrimitiveComparator.SIGNED_INT64_COMPARATOR; + } + switch (logicalType) { + case UINT_64: + return PrimitiveComparator.UNSIGNED_INT64_COMPARATOR; + case INT_64: + case DECIMAL: + case TIME_MICROS: + case TIMESTAMP_MILLIS: + case TIMESTAMP_MICROS: + return PrimitiveComparator.SIGNED_INT64_COMPARATOR; + default: + throw new ShouldNeverHappenException( + "No comparator logic implemented for INT64 logical type: " + logicalType); + } + } }, INT32("getInteger", Integer.TYPE) { @Override @@ -109,6 +132,29 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertINT32(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + if (logicalType == null) { + return PrimitiveComparator.SIGNED_INT32_COMPARATOR; + } + switch (logicalType) { + case UINT_8: + case UINT_16: + case UINT_32: + return PrimitiveComparator.UNSIGNED_INT32_COMPARATOR; + case INT_8: + case INT_16: + case INT_32: + case DECIMAL: + case DATE: + case TIME_MILLIS: + return PrimitiveComparator.SIGNED_INT32_COMPARATOR; + default: + throw new ShouldNeverHappenException( + "No comparator logic implemented for INT32 logical type: " + logicalType); + } + } }, BOOLEAN("getBoolean", Boolean.TYPE) { @Override @@ -132,6 +178,11 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertBOOLEAN(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + return PrimitiveComparator.BOOLEAN_COMPARATOR; + } }, BINARY("getBinary", Binary.class) { @Override @@ -155,6 +206,25 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertBINARY(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + if (logicalType == null) { + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; + } + switch (logicalType) { + case DECIMAL: + return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; + case UTF8: + case ENUM: + case JSON: + case BSON: + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; + default: + throw new ShouldNeverHappenException( + "No comparator logic implemented for BINARY logical type: " + logicalType); + } + } }, FLOAT("getFloat", Float.TYPE) { @Override @@ -178,6 +248,11 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertFLOAT(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + return PrimitiveComparator.FLOAT_COMPARATOR; + } }, DOUBLE("getDouble", Double.TYPE) { @Override @@ -201,6 +276,11 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertDOUBLE(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + return PrimitiveComparator.DOUBLE_COMPARATOR; + } }, INT96("getBinary", Binary.class) { @Override @@ -222,6 +302,11 @@ public void addValueToPrimitiveConverter( public T convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertINT96(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; + } }, FIXED_LEN_BYTE_ARRAY("getBinary", Binary.class) { @Override @@ -245,6 +330,22 @@ public void addValueToPrimitiveConverter( public T 
convert(PrimitiveTypeNameConverter converter) throws E { return converter.convertFIXED_LEN_BYTE_ARRAY(this); } + + @Override + PrimitiveComparator comparator(OriginalType logicalType) { + if (logicalType == null) { + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; + } + switch (logicalType) { + case DECIMAL: + return PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; + case INTERVAL: + return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; + default: + throw new ShouldNeverHappenException( + "No comparator logic implemented for FIXED_LEN_BYTE_ARRAY logical type: " + logicalType); + } + } }; public final String getMethod; @@ -275,11 +376,14 @@ abstract public void addValueToPrimitiveConverter( abstract public T convert(PrimitiveTypeNameConverter converter) throws E; + abstract PrimitiveComparator comparator(OriginalType logicalType); + } private final PrimitiveTypeName primitive; private final int length; private final DecimalMetadata decimalMeta; + private final ColumnOrder columnOrder; /** * @param repetition OPTIONAL, REPEATED, REQUIRED @@ -337,10 +441,61 @@ public PrimitiveType(Repetition repetition, PrimitiveTypeName primitive, public PrimitiveType(Repetition repetition, PrimitiveTypeName primitive, int length, String name, OriginalType originalType, DecimalMetadata decimalMeta, ID id) { + this(repetition, primitive, length, name, originalType, decimalMeta, id, null); + } + + PrimitiveType(Repetition repetition, PrimitiveTypeName primitive, + int length, String name, OriginalType originalType, + DecimalMetadata decimalMeta, ID id, ColumnOrder columnOrder) { super(name, repetition, originalType, id); this.primitive = primitive; this.length = length; this.decimalMeta = decimalMeta; + + if (columnOrder == null) { + columnOrder = primitive == PrimitiveTypeName.INT96 || originalType == OriginalType.INTERVAL + ? 
ColumnOrder.undefined() + : ColumnOrder.typeDefined(); + } + this.columnOrder = requireValidColumnOrder(columnOrder); + } + + private ColumnOrder requireValidColumnOrder(ColumnOrder columnOrder) { + if (primitive == PrimitiveTypeName.INT96) { + Preconditions.checkArgument(columnOrder.getColumnOrderName() == ColumnOrderName.UNDEFINED, + "The column order {} is not supported by INT96", columnOrder); + } + if (getOriginalType() != null) { + // Explicitly listing all the logical types to avoid having unsupported column orders new types accidentally + switch (getOriginalType()) { + case INT_8: + case INT_16: + case INT_32: + case INT_64: + case UINT_8: + case UINT_16: + case UINT_32: + case UINT_64: + case UTF8: + case DECIMAL: + case DATE: + case TIME_MILLIS: + case TIME_MICROS: + case TIMESTAMP_MILLIS: + case TIMESTAMP_MICROS: + case ENUM: + case JSON: + case BSON: + // Currently any available column order is valid + break; + case INTERVAL: + default: + Preconditions.checkArgument(columnOrder.getColumnOrderName() == ColumnOrderName.UNDEFINED, + "The column order {} is not supported by {} ({})", columnOrder, primitive, getOriginalType()); + break; + } + } + return columnOrder; } /** @@ -349,7 +504,8 @@ public PrimitiveType(Repetition repetition, PrimitiveTypeName primitive, */ @Override public PrimitiveType withId(int id) { - return new PrimitiveType(getRepetition(), primitive, length, getName(), getOriginalType(), decimalMeta, new ID(id)); + return new PrimitiveType(getRepetition(), primitive, length, getName(), getOriginalType(), decimalMeta, new ID(id), + columnOrder); } /** @@ -441,6 +597,7 @@ protected boolean equals(Type other) { return super.equals(other) && primitive == otherPrimitive.getPrimitiveTypeName() && length == otherPrimitive.length + && columnOrder.equals(otherPrimitive.columnOrder) && eqOrBothNull(decimalMeta, otherPrimitive.decimalMeta); } @@ -452,6 +609,7 @@ public int hashCode() { int hash = super.hashCode(); hash = hash * 31 + primitive.hashCode(); hash = hash * 31 + length; + hash = hash * 31 + columnOrder.hashCode(); if (decimalMeta != null) { hash = hash * 31 + decimalMeta.hashCode(); } @@ -519,6 +677,11 @@ private void reportSchemaMergeError(Type toMerge) { throw new IncompatibleSchemaModificationException("can not merge type " + toMerge + " into " + this); } + private void reportSchemaMergeErrorWithColumnOrder(Type toMerge) { + throw new IncompatibleSchemaModificationException("can not merge type " + toMerge + " with column order " + + toMerge.asPrimitiveType().columnOrder() + " into " + this + " with column order " + columnOrder()); + } + @Override protected Type union(Type toMerge, boolean strict) { if (!toMerge.isPrimitive()) { @@ -537,6 +700,11 @@ protected Type union(Type toMerge, boolean strict) { if (primitive == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY && length != toMergeLength) { reportSchemaMergeError(toMerge); } + + // Can't merge primitive fields with different column orders + if (!columnOrder().equals(toMerge.asPrimitiveType().columnOrder())) { + reportSchemaMergeErrorWithColumnOrder(toMerge); + } } Types.PrimitiveBuilder builder = Types.primitive(primitive, toMerge.getRepetition()); @@ -547,4 +715,21 @@ protected Type union(Type toMerge, boolean strict) { return builder.as(getOriginalType()).named(getName()); } + + /** + * Returns the {@link Type} specific comparator for properly comparing values. The natural ordering of the values + * might not proper in certain cases (e.g. 
{@code UINT_32} requires unsigned comparison of {@code int} values while + * the natural ordering is signed.) + */ + @SuppressWarnings("unchecked") + public PrimitiveComparator comparator() { + return (PrimitiveComparator) getPrimitiveTypeName().comparator(getOriginalType()); + } + + /** + * @return the column order for this type + */ + public ColumnOrder columnOrder() { + return columnOrder; + } } diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java index e81daaea9a..0422a9d431 100644 --- a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java +++ b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java @@ -23,6 +23,7 @@ import java.util.List; import org.apache.parquet.Preconditions; +import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.apache.parquet.schema.Type.ID; import org.slf4j.Logger; @@ -316,6 +317,7 @@ public P named(String name) { private int length = NOT_SET; private int precision = NOT_SET; private int scale = NOT_SET; + private ColumnOrder columnOrder; private BasePrimitiveBuilder(P parent, PrimitiveTypeName type) { super(parent); @@ -374,6 +376,22 @@ public THIS scale(int scale) { return self(); } + /** + * Adds the column order for the primitive type. + *
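+ * For example, using the builder API from this patch,
+ * {@code Types.optional(BINARY).columnOrder(ColumnOrder.undefined()).named("b")} builds an optional binary column
+ * whose min/max ordering is left undefined (the column name is illustrative only).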

+ * In case of not set the default column order is {@link ColumnOrderName#TYPE_DEFINED_ORDER} except the type + * {@link PrimitiveTypeName#INT96} and the types annotated by {@link OriginalType#INTERVAL} where the default column + * order is {@link ColumnOrderName#UNDEFINED}. + * + * @param columnOrder + * the column order for the primitive type + * @return this builder for method chaining + */ + public THIS columnOrder(ColumnOrder columnOrder) { + this.columnOrder = columnOrder; + return self(); + } + @Override protected PrimitiveType build(String name) { if (PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY == primitiveType) { @@ -457,7 +475,7 @@ protected PrimitiveType build(String name) { } } - return new PrimitiveType(repetition, primitiveType, length, name, originalType, meta, id); + return new PrimitiveType(repetition, primitiveType, length, name, originalType, meta, id, columnOrder); } private static long maxPrecision(int numBytes) { diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java index 690c7e1730..476fbb3376 100644 --- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java +++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java @@ -74,6 +74,13 @@ public void testIntMinMax() { assertEquals(statsNeg.getMax(), 54); assertEquals(statsNeg.getMin(), -66); + assertTrue(statsNeg.compareMaxToValue(55) < 0); + assertTrue(statsNeg.compareMaxToValue(54) == 0); + assertTrue(statsNeg.compareMaxToValue(5) > 0); + assertTrue(statsNeg.compareMinToValue(0) < 0); + assertTrue(statsNeg.compareMinToValue(-66) == 0); + assertTrue(statsNeg.compareMinToValue(-67) > 0); + // Test converting to and from byte[] byte[] intMaxBytes = statsNeg.getMaxBytes(); byte[] intMinBytes = statsNeg.getMinBytes(); @@ -135,6 +142,13 @@ public void testLongMinMax() { assertEquals(statsNeg.getMax(), 993); assertEquals(statsNeg.getMin(), -9914); + assertTrue(statsNeg.compareMaxToValue(994) < 0); + assertTrue(statsNeg.compareMaxToValue(993) == 0); + assertTrue(statsNeg.compareMaxToValue(-1000) > 0); + assertTrue(statsNeg.compareMinToValue(10000) < 0); + assertTrue(statsNeg.compareMinToValue(-9914) == 0); + assertTrue(statsNeg.compareMinToValue(-9915) > 0); + // Test converting to and from byte[] byte[] longMaxBytes = statsNeg.getMaxBytes(); byte[] longMinBytes = statsNeg.getMinBytes(); @@ -196,6 +210,13 @@ public void testFloatMinMax() { assertEquals(statsNeg.getMax(), 0.65f, 1e-10); assertEquals(statsNeg.getMin(), -412.99f, 1e-10); + assertTrue(statsNeg.compareMaxToValue(1) < 0); + assertTrue(statsNeg.compareMaxToValue(0.65F) == 0); + assertTrue(statsNeg.compareMaxToValue(0.649F) > 0); + assertTrue(statsNeg.compareMinToValue(-412.98F) < 0); + assertTrue(statsNeg.compareMinToValue(-412.99F) == 0); + assertTrue(statsNeg.compareMinToValue(-450) > 0); + // Test converting to and from byte[] byte[] floatMaxBytes = statsNeg.getMaxBytes(); byte[] floatMinBytes = statsNeg.getMinBytes(); @@ -257,6 +278,13 @@ public void testDoubleMinMax() { assertEquals(statsNeg.getMax(), 23.0d, 1e-10); assertEquals(statsNeg.getMin(), -944.5d, 1e-10); + assertTrue(statsNeg.compareMaxToValue(23.0001D) < 0); + assertTrue(statsNeg.compareMaxToValue(23D) == 0); + assertTrue(statsNeg.compareMaxToValue(0D) > 0); + assertTrue(statsNeg.compareMinToValue(-400D) < 0); + assertTrue(statsNeg.compareMinToValue(-944.5D) == 0); + 
assertTrue(statsNeg.compareMinToValue(-944.500001D) > 0); + // Test converting to and from byte[] byte[] doubleMaxBytes = statsNeg.getMaxBytes(); byte[] doubleMinBytes = statsNeg.getMinBytes(); diff --git a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java index a541e1bd13..081559719c 100644 --- a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java +++ b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java @@ -248,4 +248,24 @@ private void testBinary(BinaryFactory bf, boolean reused) throws Exception { testSerializable(bf, reused); } + + @Test + public void testCompare() { + Binary b1 = Binary.fromCharSequence("aaaaaaaa"); + Binary b2 = Binary.fromString("aaaaaaab"); + Binary b3 = Binary.fromReusedByteArray("aaaaaaaaaaa".getBytes(), 1, 8); + Binary b4 = Binary.fromConstantByteBuffer(ByteBuffer.wrap("aaaaaaac".getBytes())); + + assertTrue(b1.compareTo(b2) < 0); + assertTrue(b2.compareTo(b1) > 0); + assertTrue(b3.compareTo(b4) < 0); + assertTrue(b4.compareTo(b3) > 0); + assertTrue(b1.compareTo(b4) < 0); + assertTrue(b4.compareTo(b1) > 0); + assertTrue(b2.compareTo(b4) < 0); + assertTrue(b4.compareTo(b2) > 0); + + assertTrue(b1.compareTo(b3) == 0); + assertTrue(b3.compareTo(b1) == 0); + } } diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java index 4add1740ce..05619385bc 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestMessageType.java @@ -21,9 +21,12 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; import static org.apache.parquet.schema.OriginalType.LIST; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; + import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT96; import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; import static org.apache.parquet.schema.Type.Repetition.REPEATED; import static org.apache.parquet.schema.Type.Repetition.REQUIRED; @@ -188,6 +191,48 @@ public void testMergeSchemaWithOriginalType() throws Exception { t5.union(t6)); } + @Test + public void testMergeSchemaWithColumnOrder() { + MessageType m1 = Types.buildMessage().addFields( + Types.requiredList().element( + Types.optional(BINARY).columnOrder(ColumnOrder.undefined()).named("a") + ).named("g"), + Types.optional(INT96).named("b") + ).named("root"); + MessageType m2 = Types.buildMessage().addFields( + Types.requiredList().element( + Types.optional(BINARY).columnOrder(ColumnOrder.undefined()).named("a") + ).named("g"), + Types.optional(BINARY).named("c") + ).named("root"); + MessageType m3 = Types.buildMessage().addFields( + Types.requiredList().element( + Types.optional(BINARY).named("a") + ).named("g") + ).named("root"); + + assertEquals( + Types.buildMessage().addFields( + Types.requiredList().element( + Types.optional(BINARY).named("a") + ).named("g"), + Types.optional(INT96).named("b"), + Types.optional(BINARY).named("c") + ).named("root"), + m1.union(m2)); + try { + m1.union(m3); + fail("An IncompatibleSchemaModificationException should have been thrown"); 
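+ // The union fails because column 'a' uses ColumnOrder.undefined() in m1 but gets the default
+ // TYPE_DEFINED_ORDER in m3, and primitive fields with different column orders cannot be merged.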
+ } catch (Exception e) { + assertTrue( + "The thrown exception should have been IncompatibleSchemaModificationException but was " + e.getClass(), + e instanceof IncompatibleSchemaModificationException); + assertEquals( + "can not merge type optional binary a with column order TYPE_DEFINED_ORDER into optional binary a with column order UNDEFINED", + e.getMessage()); + } + } + @Test public void testIDs() throws Exception { MessageType schema = new MessageType("test", diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java new file mode 100644 index 0000000000..3f9d6431b5 --- /dev/null +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveComparator.java @@ -0,0 +1,311 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.parquet.schema; + +import org.apache.parquet.io.api.Binary; +import org.junit.Test; + +import java.math.BigInteger; +import java.nio.ByteBuffer; + +import static org.apache.parquet.schema.PrimitiveComparator.BOOLEAN_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.DOUBLE_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.FLOAT_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.BINARY_AS_SIGNED_INTEGER_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_INT32_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.SIGNED_INT64_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.UNSIGNED_INT32_COMPARATOR; +import static org.apache.parquet.schema.PrimitiveComparator.UNSIGNED_INT64_COMPARATOR; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +/* + * This test verifies all the PrimitiveComparator implementations. The logic of all tests is the same: list the + * elements to be tested in ascending order and then compare every elements to each other (including the element + * itself) and expect the related value based on the defined order. 
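+ * For instance, for UNSIGNED_INT32_COMPARATOR the ascending order places -1 (0xFFFFFFFF) after
+ * Integer.MAX_VALUE (0x7FFFFFFF), so comparing -1 to any non-negative int is expected to yield a positive result.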
+ */ +public class TestPrimitiveComparator { + + @Test + public void testBooleanComparator() { + Boolean[] valuesInAscendingOrder = { null, false, true }; + + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Boolean vi = valuesInAscendingOrder[i]; + Boolean vj = valuesInAscendingOrder[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, BOOLEAN_COMPARATOR.compare(vi, vj)); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, BOOLEAN_COMPARATOR.compare(vi.booleanValue(), vj.booleanValue())); + } + } + } + + checkThrowingUnsupportedException(BOOLEAN_COMPARATOR, Boolean.TYPE); + } + + @Test + public void testSignedInt32Comparator() { + testInt32Comparator(SIGNED_INT32_COMPARATOR, + null, + Integer.MIN_VALUE, + -12345, + -1, + 0, + 1, + 12345, + Integer.MAX_VALUE); + } + + @Test + public void testUnsignedInt32Comparator() { + testInt32Comparator(UNSIGNED_INT32_COMPARATOR, + null, + 0, // 0x00000000 + 1, // 0x00000001 + 12345, // 0x00003039 + Integer.MAX_VALUE, // 0x7FFFFFFF + Integer.MIN_VALUE, // 0x80000000 + -12345, // 0xFFFFCFC7 + -1); // 0xFFFFFFFF + } + + private void testInt32Comparator(PrimitiveComparator comparator, Integer... valuesInAscendingOrder) { + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Integer vi = valuesInAscendingOrder[i]; + Integer vj = valuesInAscendingOrder[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, comparator.compare(vi.intValue(), vj.intValue())); + } + } + } + + checkThrowingUnsupportedException(comparator, Integer.TYPE); + } + + @Test + public void testSignedInt64Comparator() { + testInt64Comparator(SIGNED_INT64_COMPARATOR, + null, + Long.MIN_VALUE, + -12345678901L, + -1L, + 0L, + 1L, + 12345678901L, + Long.MAX_VALUE); + } + + @Test + public void testUnsignedInt64Comparator() { + testInt64Comparator(UNSIGNED_INT64_COMPARATOR, + null, + 0L, // 0x0000000000000000 + 1L, // 0x0000000000000001 + 12345678901L, // 0x00000002DFDC1C35 + Long.MAX_VALUE, // 0x7FFFFFFFFFFFFFFF + Long.MIN_VALUE, // 0x8000000000000000 + -12345678901L, // 0xFFFFFFFD2023E3CB + -1L); // 0xFFFFFFFFFFFFFFFF + } + + private void testInt64Comparator(PrimitiveComparator comparator, Long... 
valuesInAscendingOrder) { + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Long vi = valuesInAscendingOrder[i]; + Long vj = valuesInAscendingOrder[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, comparator.compare(vi.longValue(), vj.longValue())); + } + } + } + + checkThrowingUnsupportedException(comparator, Long.TYPE); + } + + @Test + public void testFloatComparator() { + Float[] valuesInAscendingOrder = { + null, + Float.NEGATIVE_INFINITY, + -Float.MAX_VALUE, + -1234.5678F, + -Float.MIN_VALUE, + 0.0F, + Float.MIN_VALUE, + 1234.5678F, + Float.MAX_VALUE, + Float.POSITIVE_INFINITY }; + + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Float vi = valuesInAscendingOrder[i]; + Float vj = valuesInAscendingOrder[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, FLOAT_COMPARATOR.compare(vi, vj)); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, FLOAT_COMPARATOR.compare(vi.floatValue(), vj.floatValue())); + } + } + } + + checkThrowingUnsupportedException(FLOAT_COMPARATOR, Float.TYPE); + } + + @Test + public void testDoubleComparator() { + Double[] valuesInAscendingOrder = { + null, + Double.NEGATIVE_INFINITY, + -Double.MAX_VALUE, + -123456.7890123456789, + -Double.MIN_VALUE, + 0.0, + Double.MIN_VALUE, + 123456.7890123456789, + Double.MAX_VALUE, + Double.POSITIVE_INFINITY }; + + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + Double vi = valuesInAscendingOrder[i]; + Double vj = valuesInAscendingOrder[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, DOUBLE_COMPARATOR.compare(vi, vj)); + if (vi != null && vj != null) { + assertSignumEquals(vi, vj, exp, DOUBLE_COMPARATOR.compare(vi.doubleValue(), vj.doubleValue())); + } + } + } + + checkThrowingUnsupportedException(DOUBLE_COMPARATOR, Double.TYPE); + } + + @Test + public void testLexicographicalBinaryComparator() { + testObjectComparator(UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR, + null, + Binary.fromConstantByteArray(new byte[0]), // || + Binary.fromConstantByteArray(new byte[] { 127, 127, 0, 127 }, 2, 1), // |00| + Binary.fromCharSequence("aaa"), // |61|61|61| + Binary.fromString("aaaa"), // |61|61|61|61| + Binary.fromReusedByteArray("aaab".getBytes()), // |61|61|61|62| + Binary.fromReusedByteArray("azzza".getBytes(), 1, 3), // |7A|7A|7A| + Binary.fromReusedByteBuffer(ByteBuffer.wrap("zzzzzz".getBytes())), // |7A|7A|7A|7A|7A|7A| + Binary.fromReusedByteBuffer(ByteBuffer.wrap("aazzzzzzaa".getBytes(), 2, 7)), // |7A|7A|7A|7A|7A|7A|61| + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { -128, -128, -128 })), // |80|80|80| + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { -128, -128, -1 }, 1, 2)) // |80|FF| + ); + } + + @Test + public void testBinaryAsSignedIntegerComparator() { + testObjectComparator(BINARY_AS_SIGNED_INTEGER_COMPARATOR, + null, + Binary.fromConstantByteArray(new BigInteger("-9999999999999999999999999999999999999999").toByteArray()), + Binary.fromReusedByteArray(new BigInteger("-9999999999999999999999999999999999999998").toByteArray()), + Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).subtract(BigInteger.ONE).toByteArray()), + Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).toByteArray()), + 
Binary.fromConstantByteArray(BigInteger.valueOf(Long.MIN_VALUE).add(BigInteger.ONE).toByteArray()), + Binary.fromReusedByteArray(new byte[] { (byte) 0xFF, (byte) 0xFF, (byte) 0xFF, -2 }, 1, 3), + Binary.fromReusedByteArray(new BigInteger("-1").toByteArray()), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new BigInteger("0").toByteArray())), + Binary.fromReusedByteBuffer(ByteBuffer.wrap(new byte[] { 0, 0, 0, 1 })), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(new byte[] { 0, 0, 0, 2 }), 2, 2), + Binary.fromConstantByteBuffer( + ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).subtract(BigInteger.ONE).toByteArray())), + Binary.fromConstantByteBuffer(ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).toByteArray())), + Binary + .fromConstantByteBuffer( + ByteBuffer.wrap(BigInteger.valueOf(Long.MAX_VALUE).add(BigInteger.ONE).toByteArray())), + Binary.fromConstantByteBuffer( + ByteBuffer.wrap(new BigInteger("999999999999999999999999999999999999999").toByteArray())), + Binary.fromReusedByteBuffer( + ByteBuffer.wrap(new BigInteger("9999999999999999999999999999999999999998").toByteArray())), + Binary.fromConstantByteBuffer( + ByteBuffer.wrap(new BigInteger("9999999999999999999999999999999999999999").toByteArray()))); + } + + private void testObjectComparator(PrimitiveComparator comparator, T... valuesInAscendingOrder) { + for (int i = 0; i < valuesInAscendingOrder.length; ++i) { + for (int j = 0; j < valuesInAscendingOrder.length; ++j) { + T vi = valuesInAscendingOrder[i]; + T vj = valuesInAscendingOrder[j]; + int exp = i - j; + assertSignumEquals(vi, vj, exp, comparator.compare(vi, vj)); + } + } + + checkThrowingUnsupportedException(comparator, null); + } + + private void assertSignumEquals(T v1, T v2, int expected, int actual) { + String sign = expected < 0 ? " < " : expected > 0 ? " > " : " = "; + assertEquals("expected: " + v1 + sign + v2, signum(expected), signum(actual)); + } + + private int signum(int i) { + return i < 0 ? -1 : i > 0 ? 
1 : 0; + } + + private void checkThrowingUnsupportedException(PrimitiveComparator comparator, Class exclude) { + if (Integer.TYPE != exclude) { + try { + comparator.compare(0, 0); + fail("An UnsupportedOperationException should have been thrown"); + } catch (UnsupportedOperationException e) { + } + } + if (Long.TYPE != exclude) { + try { + comparator.compare(0L, 0L); + fail("An UnsupportedOperationException should have been thrown"); + } catch (UnsupportedOperationException e) { + } + } + if (Float.TYPE != exclude) { + try { + comparator.compare(0.0F, 0.0F); + fail("An UnsupportedOperationException should have been thrown"); + } catch (UnsupportedOperationException e) { + } + } + if (Double.TYPE != exclude) { + try { + comparator.compare(0.0, 0.0); + fail("An UnsupportedOperationException should have been thrown"); + } catch (UnsupportedOperationException e) { + } + } + if (Boolean.TYPE != exclude) { + try { + comparator.compare(false, false); + fail("An UnsupportedOperationException should have been thrown"); + } catch (UnsupportedOperationException e) { + } + } + } +} diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java index 0c39ef2ba1..0b1f41a59c 100644 --- a/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java +++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestTypeBuilders.java @@ -24,6 +24,7 @@ import org.junit.Assert; import org.junit.Test; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type.Repetition; import static org.apache.parquet.schema.OriginalType.*; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; @@ -1348,6 +1349,52 @@ public void testOptionalMapWithinList() { Assert.assertEquals(expected, actual); } + @Test + public void testTypeConstructionWithUndefinedColumnOrder() { + PrimitiveTypeName[] types = new PrimitiveTypeName[] { + BOOLEAN, INT32, INT64, INT96, FLOAT, DOUBLE, BINARY, FIXED_LEN_BYTE_ARRAY + }; + for (PrimitiveTypeName type : types) { + String name = type.toString() + "_"; + int len = type == FIXED_LEN_BYTE_ARRAY ? 42 : 0; + PrimitiveType expected = new PrimitiveType(Repetition.OPTIONAL, type, len, name, null, null, null, + ColumnOrder.undefined()); + PrimitiveType built = Types.optional(type).length(len).columnOrder(ColumnOrder.undefined()).named(name); + Assert.assertEquals(expected, built); + } + } + + @Test + public void testTypeConstructionWithTypeDefinedColumnOrder() { + PrimitiveTypeName[] types = new PrimitiveTypeName[] { + BOOLEAN, INT32, INT64, FLOAT, DOUBLE, BINARY, FIXED_LEN_BYTE_ARRAY + }; + for (PrimitiveTypeName type : types) { + String name = type.toString() + "_"; + int len = type == FIXED_LEN_BYTE_ARRAY ? 
42 : 0; + PrimitiveType expected = new PrimitiveType(Repetition.OPTIONAL, type, len, name, null, null, null, + ColumnOrder.typeDefined()); + PrimitiveType built = Types.optional(type).length(len).columnOrder(ColumnOrder.typeDefined()).named(name); + Assert.assertEquals(expected, built); + } + } + + @Test + public void testTypeConstructionWithUnsupportedColumnOrder() { + assertThrows(null, IllegalArgumentException.class, new Callable() { + @Override + public PrimitiveType call() { + return Types.optional(INT96).columnOrder(ColumnOrder.typeDefined()).named("int96_unsupported"); + } + }); + assertThrows(null, IllegalArgumentException.class, new Callable() { + @Override + public PrimitiveType call() { + return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL) + .columnOrder(ColumnOrder.typeDefined()).named("interval_unsupported"); + } + }); + } /** * A convenience method to avoid a large number of @Test(expected=...) tests diff --git a/parquet-generator/src/main/java/org/apache/parquet/filter2/IncrementallyUpdatedFilterPredicateGenerator.java b/parquet-generator/src/main/java/org/apache/parquet/filter2/IncrementallyUpdatedFilterPredicateGenerator.java index 1dfaf6f03d..fc5413e11a 100644 --- a/parquet-generator/src/main/java/org/apache/parquet/filter2/IncrementallyUpdatedFilterPredicateGenerator.java +++ b/parquet-generator/src/main/java/org/apache/parquet/filter2/IncrementallyUpdatedFilterPredicateGenerator.java @@ -45,28 +45,28 @@ public IncrementallyUpdatedFilterPredicateGenerator(File file) throws IOExceptio private static class TypeInfo { public final String className; public final String primitiveName; - public final boolean useComparable; public final boolean supportsInequality; - private TypeInfo(String className, String primitiveName, boolean useComparable, boolean supportsInequality) { + private TypeInfo(String className, String primitiveName, boolean supportsInequality) { this.className = className; this.primitiveName = primitiveName; - this.useComparable = useComparable; this.supportsInequality = supportsInequality; } } private static final TypeInfo[] TYPES = new TypeInfo[]{ - new TypeInfo("Integer", "int", false, true), - new TypeInfo("Long", "long", false, true), - new TypeInfo("Boolean", "boolean", false, false), - new TypeInfo("Float", "float", false, true), - new TypeInfo("Double", "double", false, true), - new TypeInfo("Binary", "Binary", true, true), + new TypeInfo("Integer", "int", true), + new TypeInfo("Long", "long", true), + new TypeInfo("Boolean", "boolean", false), + new TypeInfo("Float", "float", true), + new TypeInfo("Double", "double", true), + new TypeInfo("Binary", "Binary", true), }; public void run() throws IOException { add("package org.apache.parquet.filter2.recordlevel;\n" + + "\n" + + "import java.util.List;\n" + "\n" + "import org.apache.parquet.hadoop.metadata.ColumnPath;\n" + "import org.apache.parquet.filter2.predicate.Operators.Eq;\n" + @@ -79,7 +79,9 @@ public void run() throws IOException { "import org.apache.parquet.filter2.predicate.Operators.UserDefined;\n" + "import org.apache.parquet.filter2.predicate.UserDefinedPredicate;\n" + "import org.apache.parquet.filter2.recordlevel.IncrementallyUpdatedFilterPredicate.ValueInspector;\n" + - "import org.apache.parquet.io.api.Binary;\n\n" + + "import org.apache.parquet.io.api.Binary;\n" + + "import org.apache.parquet.io.PrimitiveColumnIO;\n" + + "import org.apache.parquet.schema.PrimitiveComparator;\n\n" + "/**\n" + " * This class is auto-generated by {@link 
parquet.filter2.IncrementallyUpdatedFilterPredicateGenerator}\n" + " * Do not manually edit!\n" + @@ -88,6 +90,10 @@ public void run() throws IOException { add("public class IncrementallyUpdatedFilterPredicateBuilder extends IncrementallyUpdatedFilterPredicateBuilderBase {\n\n"); + add(" public IncrementallyUpdatedFilterPredicateBuilder(List leaves) {\n" + + " super(leaves);\n" + + " }\n\n"); + addVisitBegin("Eq"); for (TypeInfo info : TYPES) { addEqNotEqCase(info, true); @@ -180,6 +186,7 @@ private void addEqNotEqCase(TypeInfo info, boolean isEq) throws IOException { " };\n" + " } else {\n" + " final " + info.primitiveName + " target = (" + info.className + ") (Object) pred.getValue();\n" + + " final PrimitiveComparator<" + info.className + "> comparator = getComparator(columnPath);\n" + "\n" + " valueInspector = new ValueInspector() {\n" + " @Override\n" + @@ -190,11 +197,7 @@ private void addEqNotEqCase(TypeInfo info, boolean isEq) throws IOException { " @Override\n" + " public void update(" + info.primitiveName + " value) {\n"); - if (info.useComparable) { - add(" setResult(" + compareEquality("value", "target", isEq) + ");\n"); - } else { - add(" setResult(" + (isEq ? "value == target" : "value != target" ) + ");\n"); - } + add(" setResult(" + compareEquality("value", "target", isEq) + ");\n"); add(" }\n" + " };\n" + @@ -212,6 +215,7 @@ private void addInequalityCase(TypeInfo info, String op) throws IOException { add(" if (clazz.equals(" + info.className + ".class)) {\n" + " final " + info.primitiveName + " target = (" + info.className + ") (Object) pred.getValue();\n" + + " final PrimitiveComparator<" + info.className + "> comparator = getComparator(columnPath);\n" + "\n" + " valueInspector = new ValueInspector() {\n" + " @Override\n" + @@ -222,11 +226,8 @@ private void addInequalityCase(TypeInfo info, String op) throws IOException { " @Override\n" + " public void update(" + info.primitiveName + " value) {\n"); - if (info.useComparable) { - add(" setResult(value.compareTo(target) " + op + " 0);\n"); - } else { - add(" setResult(value " + op + " target);\n"); - } + add(" setResult(comparator.compare(value, target) " + op + " 0);\n"); + add(" }\n" + " };\n" + " }\n\n"); @@ -260,7 +261,7 @@ private void addUdpCase(TypeInfo info, boolean invert)throws IOException { } private String compareEquality(String var, String target, boolean eq) { - return var + ".compareTo(" + target + ")" + (eq ? " == 0 " : " != 0"); + return "comparator.compare(" + var + ", " + target + ")" + (eq ? 
" == 0 " : " != 0"); } private void add(String s) throws IOException { diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java index 19604ec98e..eaba2c1cb8 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java @@ -33,6 +33,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -75,7 +76,7 @@ private ColumnChunkMetaData getColumnChunk(ColumnPath columnPath) { @SuppressWarnings("unchecked") private > Set expandDictionary(ColumnChunkMetaData meta) throws IOException { - ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getType(), -1, -1); + ColumnDescriptor col = new ColumnDescriptor(meta.getPath().toArray(), meta.getPrimitiveType(), -1, -1); DictionaryPage page = dictionaries.readDictionaryPage(col); // the chunk may not be dictionary-encoded @@ -212,8 +213,9 @@ public > Boolean visit(Lt lt) { return BLOCK_MIGHT_MATCH; } + Comparator comparator = meta.getPrimitiveType().comparator(); for (T entry : dictSet) { - if (value.compareTo(entry) > 0) { + if (comparator.compare(value, entry) > 0) { return BLOCK_MIGHT_MATCH; } } @@ -253,8 +255,9 @@ public > Boolean visit(LtEq ltEq) { return BLOCK_MIGHT_MATCH; } + Comparator comparator = meta.getPrimitiveType().comparator(); for (T entry : dictSet) { - if (value.compareTo(entry) >= 0) { + if (comparator.compare(value, entry) >= 0) { return BLOCK_MIGHT_MATCH; } } @@ -292,8 +295,9 @@ public > Boolean visit(Gt gt) { return BLOCK_MIGHT_MATCH; } + Comparator comparator = meta.getPrimitiveType().comparator(); for (T entry : dictSet) { - if (value.compareTo(entry) < 0) { + if (comparator.compare(value, entry) < 0) { return BLOCK_MIGHT_MATCH; } } @@ -333,8 +337,9 @@ public > Boolean visit(GtEq gtEq) { return BLOCK_MIGHT_MATCH; } + Comparator comparator = meta.getPrimitiveType().comparator(); for (T entry : dictSet) { - if (value.compareTo(entry) <= 0) { + if (comparator.compare(value, entry) <= 0) { return BLOCK_MIGHT_MATCH; } } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java index ac7132e74e..f168a6004c 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java @@ -134,7 +134,7 @@ public > Boolean visit(Eq eq) { } // drop if value < min || value > max - return value.compareTo(stats.genericGetMin()) < 0 || value.compareTo(stats.genericGetMax()) > 0; + return stats.compareMinToValue(value) > 0 || stats.compareMaxToValue(value) < 0; } @Override @@ -173,7 +173,7 @@ public > Boolean visit(NotEq notEq) { } // drop if this is a column where min = max = value - return value.compareTo(stats.genericGetMin()) == 0 && value.compareTo(stats.genericGetMax()) == 0; + return stats.compareMinToValue(value) == 0 && stats.compareMaxToValue(value) == 0; } @Override @@ -204,7 +204,7 @@ public > Boolean visit(Lt lt) { T value = lt.getValue(); // drop if value <= min - return value.compareTo(stats.genericGetMin()) <= 0; + return 
stats.compareMinToValue(value) >= 0; } @Override @@ -235,7 +235,7 @@ public > Boolean visit(LtEq ltEq) { T value = ltEq.getValue(); // drop if value < min - return value.compareTo(stats.genericGetMin()) < 0; + return stats.compareMinToValue(value) > 0; } @Override @@ -266,7 +266,7 @@ public > Boolean visit(Gt gt) { T value = gt.getValue(); // drop if value >= max - return value.compareTo(stats.genericGetMax()) >= 0; + return stats.compareMaxToValue(value) <= 0; } @Override @@ -296,8 +296,8 @@ public > Boolean visit(GtEq gtEq) { T value = gtEq.getValue(); - // drop if value >= max - return value.compareTo(stats.genericGetMax()) > 0; + // drop if value > max + return stats.compareMaxToValue(value) < 0; } @Override @@ -356,7 +356,8 @@ private , U extends UserDefinedPredicate> Boolean vis } org.apache.parquet.filter2.predicate.Statistics udpStats = - new org.apache.parquet.filter2.predicate.Statistics(stats.genericGetMin(), stats.genericGetMax()); + new org.apache.parquet.filter2.predicate.Statistics(stats.genericGetMin(), stats.genericGetMax(), + stats.comparator()); if (inverted) { return udp.inverseCanDrop(udpStats); diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index 163056c4dc..ef59760640 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -44,6 +44,7 @@ import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.format.ColumnChunk; import org.apache.parquet.format.ColumnMetaData; +import org.apache.parquet.format.ColumnOrder; import org.apache.parquet.format.ConvertedType; import org.apache.parquet.format.DataPageHeader; import org.apache.parquet.format.DataPageHeaderV2; @@ -58,12 +59,14 @@ import org.apache.parquet.format.SchemaElement; import org.apache.parquet.format.Statistics; import org.apache.parquet.format.Type; +import org.apache.parquet.format.TypeDefinedOrder; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.column.EncodingStats; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.schema.ColumnOrder.ColumnOrderName; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; @@ -79,6 +82,7 @@ // TODO: Lets split it up: https://issues.apache.org/jira/browse/PARQUET-310 public class ParquetMetadataConverter { + private static final TypeDefinedOrder TYPE_DEFINED_ORDER = new TypeDefinedOrder(); public static final MetadataFilter NO_FILTER = new NoFilter(); public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter(); public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k @@ -135,9 +139,24 @@ public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parque } fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy()); + + fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema())); + return fileMetaData; } + private List getColumnOrders(MessageType schema) { + List columnOrders = new ArrayList<>(); + // Currently, only 
TypeDefinedOrder is supported, so we create a column order for each columns with + // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders. + for (int i = 0, n = schema.getPaths().size(); i < n; ++i) { + ColumnOrder columnOrder = new ColumnOrder(); + columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER); + columnOrders.add(columnOrder); + } + return columnOrders; + } + // Visible for testing List toParquetSchema(MessageType schema) { List result = new ArrayList(); @@ -326,20 +345,37 @@ dataPageType, getEncoding(encoding), } public static Statistics toParquetStatistics( - org.apache.parquet.column.statistics.Statistics statistics) { - Statistics stats = new Statistics(); + org.apache.parquet.column.statistics.Statistics stats) { + Statistics formatStats = new Statistics(); // Don't write stats larger than the max size rather than truncating. The // rationale is that some engines may use the minimum value in the page as // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. - if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) { - stats.setNull_count(statistics.getNumNulls()); - if (statistics.hasNonNullValue()) { - stats.setMax(statistics.getMaxBytes()); - stats.setMin(statistics.getMinBytes()); + if (!stats.isEmpty() && stats.isSmallerThan(MAX_STATS_SIZE)) { + formatStats.setNull_count(stats.getNumNulls()); + if (stats.hasNonNullValue()) { + byte[] min = stats.getMinBytes(); + byte[] max = stats.getMaxBytes(); + + // Fill the former min-max statistics only if the comparison logic is + // signed so the logic of V1 and V2 stats are the same (which is + // trivially true for equal min-max values) + if (sortOrder(stats.type()) == SortOrder.SIGNED || Arrays.equals(min, max)) { + formatStats.setMin(min); + formatStats.setMax(max); + } + + if (isMinMaxStatsSupported(stats.type()) || Arrays.equals(min, max)) { + formatStats.setMin_value(min); + formatStats.setMax_value(max); + } } } - return stats; + return formatStats; + } + + private static boolean isMinMaxStatsSupported(PrimitiveType type) { + return type.columnOrder().getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER; } /** @@ -357,29 +393,42 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist @Deprecated public static org.apache.parquet.column.statistics.Statistics fromParquetStatistics (String createdBy, Statistics statistics, PrimitiveTypeName type) { - return fromParquetStatisticsInternal(createdBy, statistics, type, defaultSortOrder(type)); + return fromParquetStatisticsInternal(createdBy, statistics, + new PrimitiveType(Repetition.OPTIONAL, type, "fake_type"), defaultSortOrder(type)); } // Visible for testing static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal - (String createdBy, Statistics statistics, PrimitiveTypeName type, SortOrder typeSortOrder) { + (String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) { // create stats object based on the column type - org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType(type); - // If there was no statistics written to the footer, create an empty Statistics object and return - - boolean isSet = statistics != null && statistics.isSetMax() && statistics.isSetMin(); - boolean maxEqualsMin = isSet ? 
Arrays.equals(statistics.getMin(), statistics.getMax()) : false; - boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; - // NOTE: See docs in CorruptStatistics for explanation of why this check is needed - // The sort order is checked to avoid returning min/max stats that are not - // valid with the type's sort order. Currently, all stats are aggregated - // using a signed ordering, which isn't valid for strings or unsigned ints. - if (statistics != null && !CorruptStatistics.shouldIgnoreStatistics(createdBy, type) && - ( sortOrdersMatch || maxEqualsMin)) { - if (isSet) { - stats.setMinMaxFromBytes(statistics.min.array(), statistics.max.array()); + org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.createStats(type); + + if (formatStats != null) { + // Use the new V2 min-max statistics over the former one if it is filled + if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) { + byte[] min = formatStats.min_value.array(); + byte[] max = formatStats.max_value.array(); + if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) { + stats.setMinMaxFromBytes(min, max); + } + stats.setNumNulls(formatStats.null_count); + } else { + boolean isSet = formatStats.isSetMax() && formatStats.isSetMin(); + boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false; + boolean sortOrdersMatch = SortOrder.SIGNED == typeSortOrder; + // NOTE: See docs in CorruptStatistics for explanation of why this check is needed + // The sort order is checked to avoid returning min/max stats that are not + // valid with the type's sort order. In previous releases, all stats were + // aggregated using a signed byte-wise ordering, which isn't valid for all the + // types (e.g. strings, decimals etc.). + if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) && + (sortOrdersMatch || maxEqualsMin)) { + if (isSet) { + stats.setMinMaxFromBytes(formatStats.min.array(), formatStats.max.array()); + } + stats.setNumNulls(formatStats.null_count); + } } - stats.setNumNulls(statistics.null_count); } return stats; } @@ -389,7 +438,7 @@ public org.apache.parquet.column.statistics.Statistics fromParquetStatistics( SortOrder expectedOrder = overrideSortOrderToSigned(type) ? 
SortOrder.SIGNED : sortOrder(type); return fromParquetStatisticsInternal( - createdBy, statistics, type.getPrimitiveTypeName(), expectedOrder); + createdBy, statistics, type, expectedOrder); } /** @@ -827,7 +876,7 @@ public FileMetaData visit(RangeMetadataFilter filter) throws IOException { } public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException { - MessageType messageType = fromParquetSchema(parquetMetadata.getSchema()); + MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders()); List blocks = new ArrayList(); List row_groups = parquetMetadata.getRow_groups(); if (row_groups != null) { @@ -846,7 +895,7 @@ public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws ColumnPath path = getPath(metaData); ColumnChunkMetaData column = ColumnChunkMetaData.get( path, - messageType.getType(path.toArray()).asPrimitiveType().getPrimitiveTypeName(), + messageType.getType(path.toArray()).asPrimitiveType(), fromFormatCodec(metaData.codec), convertEncodingStats(metaData.getEncoding_stats()), fromFormatEncodings(metaData.encodings), @@ -886,20 +935,22 @@ private static ColumnPath getPath(ColumnMetaData metaData) { } // Visible for testing - MessageType fromParquetSchema(List schema) { + MessageType fromParquetSchema(List schema, List columnOrders) { Iterator iterator = schema.iterator(); SchemaElement root = iterator.next(); Types.MessageTypeBuilder builder = Types.buildMessage(); if (root.isSetField_id()) { builder.id(root.field_id); } - buildChildren(builder, iterator, root.getNum_children()); + buildChildren(builder, iterator, root.getNum_children(), columnOrders, 0); return builder.named(root.name); } private void buildChildren(Types.GroupBuilder builder, Iterator schema, - int childrenCount) { + int childrenCount, + List columnOrders, + int columnCount) { for (int i = 0; i < childrenCount; i++) { SchemaElement schemaElement = schema.next(); @@ -918,11 +969,21 @@ private void buildChildren(Types.GroupBuilder builder, if (schemaElement.isSetScale()) { primitiveBuilder.scale(schemaElement.scale); } + if (columnOrders != null) { + org.apache.parquet.schema.ColumnOrder columnOrder = fromParquetColumnOrder(columnOrders.get(columnCount)); + // As per parquet format 2.4.0 no UNDEFINED order is supported. So, set undefined column order for the types + // where ordering is not supported. 
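+ // (i.e. INT96 columns and columns annotated with INTERVAL fall back to ColumnOrder.undefined()
+ // even if the footer marks them as TYPE_DEFINED_ORDER)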
+ if (columnOrder.getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER + && (schemaElement.type == Type.INT96 || schemaElement.converted_type == ConvertedType.INTERVAL)) { + columnOrder = org.apache.parquet.schema.ColumnOrder.undefined(); + } + primitiveBuilder.columnOrder(columnOrder); + } childBuilder = primitiveBuilder; } else { childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type)); - buildChildren((Types.GroupBuilder) childBuilder, schema, schemaElement.num_children); + buildChildren((Types.GroupBuilder) childBuilder, schema, schemaElement.num_children, columnOrders, columnCount); } if (schemaElement.isSetConverted_type()) { @@ -933,6 +994,7 @@ private void buildChildren(Types.GroupBuilder builder, } childBuilder.named(schemaElement.name); + ++columnCount; } } @@ -946,6 +1008,14 @@ Repetition fromParquetRepetition(FieldRepetitionType repetition) { return Repetition.valueOf(repetition.name()); } + private static org.apache.parquet.schema.ColumnOrder fromParquetColumnOrder(ColumnOrder columnOrder) { + if (columnOrder.isSetTYPE_ORDER()) { + return org.apache.parquet.schema.ColumnOrder.typeDefined(); + } + // The column order is not yet supported by this API + return org.apache.parquet.schema.ColumnOrder.undefined(); + } + @Deprecated public void writeDataPageHeader( int uncompressedSize, @@ -994,8 +1064,7 @@ private PageHeader newDataPageHeader( getEncoding(dlEncoding), getEncoding(rlEncoding))); if (!statistics.isEmpty()) { - pageHeader.getData_page_header().setStatistics( - toParquetStatistics(statistics)); + pageHeader.getData_page_header().setStatistics(toParquetStatistics(statistics)); } return pageHeader; } diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java index ac3cd3b8b2..82c288fe43 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java @@ -18,8 +18,6 @@ */ package org.apache.parquet.hadoop; -import static org.apache.parquet.column.statistics.Statistics.getStatsBasedOnType; - import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; @@ -79,7 +77,6 @@ private ColumnChunkPageWriter(ColumnDescriptor path, this.compressor = compressor; this.allocator = allocator; this.buf = new ConcatenatingByteArrayCollector(); - this.totalStatistics = getStatsBasedOnType(this.path.getType()); } @Override @@ -116,7 +113,14 @@ public void writePage(BytesInput bytes, this.compressedLength += compressedSize; this.totalValueCount += valueCount; this.pageCount += 1; - this.totalStatistics.mergeStatistics(statistics); + + // Copying the statistics if it is not initialized yet so we have the correct typed one + if (totalStatistics == null) { + totalStatistics = statistics.copy(); + } else { + totalStatistics.mergeStatistics(statistics); + } + // by concatenating before collecting instead of collecting twice, // we only allocate one buffer to copy into instead of multiple. 
buf.collect(BytesInput.concat(BytesInput.from(tempOutputStream), compressedBytes)); @@ -154,7 +158,13 @@ public void writePageV2( this.compressedLength += compressedSize; this.totalValueCount += valueCount; this.pageCount += 1; - this.totalStatistics.mergeStatistics(statistics); + + // Copying the statistics if it is not initialized yet so we have the correct typed one + if (totalStatistics == null) { + totalStatistics = statistics.copy(); + } else { + totalStatistics.mergeStatistics(statistics); + } // by concatenating before collecting instead of collecting twice, // we only allocate one buffer to copy into instead of multiple. diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index da8635d099..285c2db1a4 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -65,7 +65,7 @@ import org.apache.parquet.io.ParquetEncodingException; import org.apache.parquet.io.PositionOutputStream; import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.TypeUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -116,7 +116,7 @@ public static enum Mode { // column chunk data set at the start of a column private CompressionCodecName currentChunkCodec; // set in startColumn private ColumnPath currentChunkPath; // set in startColumn - private PrimitiveTypeName currentChunkType; // set in startColumn + private PrimitiveType currentChunkType; // set in startColumn private long currentChunkValueCount; // set in startColumn private long currentChunkFirstDataPage; // set in startColumn (out.pos()) private long currentChunkDictionaryPageOffset; // set in writeDictionaryPage @@ -317,15 +317,14 @@ public void startColumn(ColumnDescriptor descriptor, encodingStatsBuilder.clear(); currentEncodings = new HashSet(); currentChunkPath = ColumnPath.get(descriptor.getPath()); - currentChunkType = descriptor.getType(); + currentChunkType = descriptor.getPrimitiveType(); currentChunkCodec = compressionCodecName; currentChunkValueCount = valueCount; currentChunkFirstDataPage = out.getPos(); compressedLength = 0; uncompressedLength = 0; - // need to know what type of stats to initialize to - // better way to do this? 
- currentStatistics = Statistics.getStatsBasedOnType(currentChunkType); + // The statistics will be copied from the first one added at writeDataPage(s) so we have the correct typed one + currentStatistics = null; } /** @@ -425,7 +424,14 @@ public void writeDataPage( this.compressedLength += compressedPageSize + headerSize; LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize); bytes.writeAllTo(out); - currentStatistics.mergeStatistics(statistics); + + // Copying the statistics if it is not initialized yet so we have the correct typed one + if (currentStatistics == null) { + currentStatistics = statistics.copy(); + } else { + currentStatistics.mergeStatistics(statistics); + } + encodingStatsBuilder.addDataEncoding(valuesEncoding); currentEncodings.add(rlEncoding); currentEncodings.add(dlEncoding); @@ -599,7 +605,7 @@ public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup, currentBlock.addColumn(ColumnChunkMetaData.get( chunk.getPath(), - chunk.getType(), + chunk.getPrimitiveType(), chunk.getCodec(), chunk.getEncodingStats(), chunk.getEncodings(), diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java index 720bd77924..e1986986f7 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkMetaData.java @@ -24,7 +24,9 @@ import org.apache.parquet.column.EncodingStats; import org.apache.parquet.column.statistics.BooleanStatistics; import org.apache.parquet.column.statistics.Statistics; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Types; /** * Column meta data for a block stored in the file footer and passed in the InputSplit @@ -65,6 +67,12 @@ public static ColumnChunkMetaData get( valueCount, totalSize, totalUncompressedSize); } + /** + * @deprecated will be removed in 2.0.0. Use + * {@link #get(ColumnPath, PrimitiveType, CompressionCodecName, EncodingStats, Set, Statistics, long, long, long, long, long)} + * instead. + */ + @Deprecated public static ColumnChunkMetaData get( ColumnPath path, PrimitiveTypeName type, @@ -77,6 +85,22 @@ public static ColumnChunkMetaData get( long valueCount, long totalSize, long totalUncompressedSize) { + return get(path, Types.optional(type).named("fake_type"), codec, encodingStats, encodings, statistics, + firstDataPage, dictionaryPageOffset, valueCount, totalSize, totalUncompressedSize); + } + + public static ColumnChunkMetaData get( + ColumnPath path, + PrimitiveType type, + CompressionCodecName codec, + EncodingStats encodingStats, + Set encodings, + Statistics statistics, + long firstDataPage, + long dictionaryPageOffset, + long valueCount, + long totalSize, + long totalUncompressedSize) { // to save space we store those always positive longs in ints when they fit. if (positiveLongFitsInAnInt(firstDataPage) && positiveLongFitsInAnInt(dictionaryPageOffset) @@ -149,18 +173,29 @@ public CompressionCodecName getCodec() { /** * * @return column identifier + * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. */ + @Deprecated public ColumnPath getPath() { return properties.getPath(); } /** * @return type of the column + * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. 
*/ + @Deprecated public PrimitiveTypeName getType() { return properties.getType(); } + /** + * @return the primitive type object of the column + */ + public PrimitiveType getPrimitiveType() { + return properties.getPrimitiveType(); + } + /** * @return start of the column data offset */ @@ -231,7 +266,7 @@ class IntColumnChunkMetaData extends ColumnChunkMetaData { */ IntColumnChunkMetaData( ColumnPath path, - PrimitiveTypeName type, + PrimitiveType type, CompressionCodecName codec, EncodingStats encodingStats, Set encodings, @@ -336,7 +371,7 @@ class LongColumnChunkMetaData extends ColumnChunkMetaData { */ LongColumnChunkMetaData( ColumnPath path, - PrimitiveTypeName type, + PrimitiveType type, CompressionCodecName codec, EncodingStats encodingStats, Set encodings, diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java index 5e2667501d..233cf94b1a 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/metadata/ColumnChunkProperties.java @@ -22,24 +22,36 @@ import java.util.Set; import org.apache.parquet.column.Encoding; +import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; public class ColumnChunkProperties { private static Canonicalizer properties = new Canonicalizer(); + /** + * @deprecated will be removed in 2.0.0. Use {@link #get(ColumnPath, PrimitiveType, CompressionCodecName, Set)} + * instead. + */ + @Deprecated public static ColumnChunkProperties get(ColumnPath path, PrimitiveTypeName type, CompressionCodecName codec, Set encodings) { + return get(path, new PrimitiveType(Type.Repetition.OPTIONAL, type, ""), codec, encodings); + } + + public static ColumnChunkProperties get(ColumnPath path, PrimitiveType type, CompressionCodecName codec, + Set encodings) { return properties.canonicalize(new ColumnChunkProperties(codec, path, type, encodings)); } private final CompressionCodecName codec; private final ColumnPath path; - private final PrimitiveTypeName type; + private final PrimitiveType type; private final Set encodings; private ColumnChunkProperties(CompressionCodecName codec, ColumnPath path, - PrimitiveTypeName type, + PrimitiveType type, Set encodings) { super(); this.codec = codec; @@ -56,7 +68,19 @@ public ColumnPath getPath() { return path; } + /** + * @return the primitive type name for the column + * @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead. 
+ */ + @Deprecated public PrimitiveTypeName getType() { + return type.getPrimitiveTypeName(); + } + + /** + * @return the primitive type object for the column + */ + public PrimitiveType getPrimitiveType() { return type; } @@ -68,7 +92,7 @@ public Set getEncodings() { public boolean equals(Object obj) { if (obj instanceof ColumnChunkProperties) { ColumnChunkProperties other = (ColumnChunkProperties)obj; - return other.codec == codec && other.path.equals(path) && other.type == type && equals(other.encodings, encodings); + return other.codec == codec && other.path.equals(path) && other.type.equals(type) && equals(other.encodings, encodings); } return false; } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 4df45ddc96..ee92d4625b 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -22,7 +22,9 @@ import static org.apache.parquet.format.converter.ParquetMetadataConverter.filterFileMetaDataByStart; import static org.apache.parquet.schema.MessageTypeParser.parseMessageType; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import static org.apache.parquet.format.CompressionCodec.UNCOMPRESSED; import static org.apache.parquet.format.Type.INT32; @@ -34,6 +36,8 @@ import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.math.BigInteger; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -48,6 +52,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.parquet.Version; import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.statistics.BinaryStatistics; import org.apache.parquet.column.statistics.BooleanStatistics; import org.apache.parquet.column.statistics.DoubleStatistics; @@ -61,9 +66,9 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.io.api.Binary; +import org.apache.parquet.schema.PrimitiveType; import org.junit.Assert; import org.junit.Test; - import org.apache.parquet.example.Paper; import org.apache.parquet.format.ColumnChunk; import org.apache.parquet.format.ColumnMetaData; @@ -75,6 +80,7 @@ import org.apache.parquet.format.RowGroup; import org.apache.parquet.format.SchemaElement; import org.apache.parquet.format.Type; +import org.apache.parquet.schema.ColumnOrder; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; @@ -101,7 +107,7 @@ public void testPageHeader() throws IOException { public void testSchemaConverter() { ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter(); List parquetSchema = parquetMetadataConverter.toParquetSchema(Paper.schema); - MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema); + MessageType schema = parquetMetadataConverter.fromParquetSchema(parquetSchema, null); assertEquals(Paper.schema, schema); } @@ 
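With ColumnChunkMetaData and ColumnChunkProperties now carrying the full PrimitiveType, the logical-type annotation is available from footer metadata rather than just the PrimitiveTypeName. A hedged sketch of how a caller might use the new ColumnChunkMetaData.get overload; the class name and all numeric values are made-up illustration values, and passing null for the optional EncodingStats is assumed to be acceptable:

import java.util.Collections;

import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

class ColumnChunkMetaDataSketch {
  static ColumnChunkMetaData describeUtf8Column() {
    // Full PrimitiveType (physical type plus logical annotation), not just the PrimitiveTypeName
    PrimitiveType type = Types.optional(PrimitiveTypeName.BINARY)
        .as(OriginalType.UTF8)
        .named("name");

    // Statistics created from the PrimitiveType pick up the type-defined comparator
    Statistics<?> stats = Statistics.createStats(type);

    // All counts and offsets below are placeholders for illustration only
    return ColumnChunkMetaData.get(
        ColumnPath.get("name"),
        type,
        CompressionCodecName.SNAPPY,
        null, // EncodingStats is optional metadata
        Collections.singleton(Encoding.PLAIN),
        stats,
        4L,     // firstDataPage
        0L,     // dictionaryPageOffset
        100L,   // valueCount
        1024L,  // totalSize (compressed)
        2048L); // totalUncompressedSize
  }
}

On the resulting object, getPrimitiveType() returns the full type, while the deprecated getType() still exposes only the PrimitiveTypeName.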
-370,7 +376,16 @@ public void testEncodingsCache() { } @Test - public void testBinaryStats() { + public void testBinaryStatsV1() { + testBinaryStats(StatsHelper.V1); + } + + @Test + public void testBinaryStatsV2() { + testBinaryStats(StatsHelper.V2); + } + + private void testBinaryStats(StatsHelper helper) { // make fake stats and verify the size check BinaryStatistics stats = new BinaryStatistics(); stats.incrementNumNulls(3004); @@ -384,33 +399,47 @@ public void testBinaryStats() { Assert.assertTrue("Should be smaller than min + max size + 1", stats.isSmallerThan(totalLen + 1)); - org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); - Assert.assertArrayEquals("Min should match", min, formatStats.getMin()); - Assert.assertArrayEquals("Max should match", max, formatStats.getMax()); + assertFalse("Min should not be set", formatStats.isSetMin()); + assertFalse("Max should not be set", formatStats.isSetMax()); + if (helper == StatsHelper.V2) { + Assert.assertArrayEquals("Min_value should match", min, formatStats.getMin_value()); + Assert.assertArrayEquals("Max_value should match", max, formatStats.getMax_value()); + } Assert.assertEquals("Num nulls should match", 3004, formatStats.getNull_count()); // convert to empty stats because the values are too large stats.setMinMaxFromBytes(max, max); - formatStats = ParquetMetadataConverter.toParquetStatistics(stats); + formatStats = helper.toParquetStatistics(stats); Assert.assertFalse("Min should not be set", formatStats.isSetMin()); Assert.assertFalse("Max should not be set", formatStats.isSetMax()); + Assert.assertFalse("Min_value should not be set", formatStats.isSetMin_value()); + Assert.assertFalse("Max_value should not be set", formatStats.isSetMax_value()); Assert.assertFalse("Num nulls should not be set", formatStats.isSetNull_count()); Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatisticsInternal( - Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY, + Version.FULL_VERSION, formatStats, new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, ""), ParquetMetadataConverter.SortOrder.SIGNED); Assert.assertTrue(roundTripStats.isEmpty()); } @Test - public void testIntegerStats() { + public void testIntegerStatsV1() { + testIntegerStats(StatsHelper.V1); + } + + @Test + public void testIntegerStatsV2() { + testIntegerStats(StatsHelper.V2); + } + + private void testIntegerStats(StatsHelper helper) { // make fake stats and verify the size check IntStatistics stats = new IntStatistics(); stats.incrementNumNulls(3004); @@ -419,8 +448,7 @@ public void testIntegerStats() { stats.updateStats(min); stats.updateStats(max); - org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToInt(formatStats.getMin())); @@ -431,7 +459,16 @@ public void testIntegerStats() { } @Test - public void testLongStats() { + public void testLongStatsV1() { + testLongStats(StatsHelper.V1); + } + + @Test + public void testLongStatsV2() { + testLongStats(StatsHelper.V2); + } + + private void testLongStats(StatsHelper helper) { // make fake stats and verify the size check LongStatistics stats = new LongStatistics(); stats.incrementNumNulls(3004); @@ -440,8 +477,7 @@ public void testLongStats() { 
stats.updateStats(min); stats.updateStats(max); - org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToLong(formatStats.getMin())); @@ -452,7 +488,16 @@ public void testLongStats() { } @Test - public void testFloatStats() { + public void testFloatStatsV1() { + testFloatStats(StatsHelper.V1); + } + + @Test + public void testFloatStatsV2() { + testFloatStats(StatsHelper.V2); + } + + private void testFloatStats(StatsHelper helper) { // make fake stats and verify the size check FloatStatistics stats = new FloatStatistics(); stats.incrementNumNulls(3004); @@ -461,8 +506,7 @@ public void testFloatStats() { stats.updateStats(min); stats.updateStats(max); - org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())), @@ -475,7 +519,16 @@ public void testFloatStats() { } @Test - public void testDoubleStats() { + public void testDoubleStatsV1() { + testDoubleStats(StatsHelper.V1); + } + + @Test + public void testDoubleStatsV2() { + testDoubleStats(StatsHelper.V2); + } + + private void testDoubleStats(StatsHelper helper) { // make fake stats and verify the size check DoubleStatistics stats = new DoubleStatistics(); stats.incrementNumNulls(3004); @@ -484,8 +537,7 @@ public void testDoubleStats() { stats.updateStats(min); stats.updateStats(max); - org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())), @@ -498,7 +550,16 @@ public void testDoubleStats() { } @Test - public void testBooleanStats() { + public void testBooleanStatsV1() { + testBooleanStats(StatsHelper.V1); + } + + @Test + public void testBooleanStatsV2() { + testBooleanStats(StatsHelper.V2); + } + + private void testBooleanStats(StatsHelper helper) { // make fake stats and verify the size check BooleanStatistics stats = new BooleanStatistics(); stats.incrementNumNulls(3004); @@ -507,8 +568,7 @@ public void testBooleanStats() { stats.updateStats(min); stats.updateStats(max); - org.apache.parquet.format.Statistics formatStats = - ParquetMetadataConverter.toParquetStatistics(stats); + org.apache.parquet.format.Statistics formatStats = helper.toParquetStatistics(stats); Assert.assertEquals("Min should match", min, BytesUtils.bytesToBool(formatStats.getMin())); @@ -528,17 +588,27 @@ public void testIgnoreStatsWithSignedSortOrder() { stats.updateStats(Binary.fromString("z")); stats.incrementNumNulls(); + PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY) + .as(OriginalType.UTF8).named("b"); Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - ParquetMetadataConverter.toParquetStatistics(stats), - Types.required(PrimitiveTypeName.BINARY) - .as(OriginalType.UTF8).named("b")); + StatsHelper.V1.toParquetStatistics(stats), + binaryType); Assert.assertTrue("Stats should be empty: " + convertedStats, convertedStats.isEmpty()); } @Test - public void testStillUseStatsWithSignedSortOrderIfSingleValue() { + public void 
testStillUseStatsWithSignedSortOrderIfSingleValueV1() { + testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper.V1); + } + + @Test + public void testStillUseStatsWithSignedSortOrderIfSingleValueV2() { + testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper.V2); + } + + private void testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper helper) { ParquetMetadataConverter converter = new ParquetMetadataConverter(); BinaryStatistics stats = new BinaryStatistics(); stats.incrementNumNulls(); @@ -547,18 +617,27 @@ public void testStillUseStatsWithSignedSortOrderIfSingleValue() { stats.updateStats(Binary.fromString("A")); stats.incrementNumNulls(); + PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b"); Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, ParquetMetadataConverter.toParquetStatistics(stats), - Types.required(PrimitiveTypeName.BINARY) - .as(OriginalType.UTF8).named("b")); + binaryType); Assert.assertFalse("Stats should not be empty: " + convertedStats, convertedStats.isEmpty()); Assert.assertArrayEquals("min == max: " + convertedStats, convertedStats.getMaxBytes(), convertedStats.getMinBytes()); } @Test - public void testUseStatsWithSignedSortOrder() { + public void testUseStatsWithSignedSortOrderV1() { + testUseStatsWithSignedSortOrder(StatsHelper.V1); + } + + @Test + public void testUseStatsWithSignedSortOrderV2() { + testUseStatsWithSignedSortOrder(StatsHelper.V2); + } + + private void testUseStatsWithSignedSortOrder(StatsHelper helper) { // override defaults and use stats that were accumulated using signed order Configuration conf = new Configuration(); conf.setBoolean("parquet.strings.signed-min-max.enabled", true); @@ -571,17 +650,213 @@ public void testUseStatsWithSignedSortOrder() { stats.updateStats(Binary.fromString("z")); stats.incrementNumNulls(); + PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY) + .as(OriginalType.UTF8).named("b"); Statistics convertedStats = converter.fromParquetStatistics( Version.FULL_VERSION, - ParquetMetadataConverter.toParquetStatistics(stats), - Types.required(PrimitiveTypeName.BINARY) - .as(OriginalType.UTF8).named("b")); + helper.toParquetStatistics(stats), + binaryType); Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty()); Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls()); - Assert.assertEquals("Should have correct min (unsigned sort)", - Binary.fromString("A"), convertedStats.genericGetMin()); - Assert.assertEquals("Should have correct max (unsigned sort)", - Binary.fromString("z"), convertedStats.genericGetMax()); + if (helper == StatsHelper.V1) { + assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue()); + } else { + Assert.assertEquals("Should have correct min (unsigned sort)", + Binary.fromString("A"), convertedStats.genericGetMin()); + Assert.assertEquals("Should have correct max (unsigned sort)", + Binary.fromString("z"), convertedStats.genericGetMax()); + } + } + + @Test + public void testSkippedV2Stats() { + testSkippedV2Stats( + Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named(""), + new BigInteger("12345678"), + new BigInteger("12345679")); + testSkippedV2Stats(Types.optional(PrimitiveTypeName.INT96).named(""), + new BigInteger("-75687987"), + new BigInteger("45367657")); + } + + private void testSkippedV2Stats(PrimitiveType type, Object min, Object max) { + Statistics stats = 
createStats(type, min, max); + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); + assertFalse(statistics.isSetMin()); + assertFalse(statistics.isSetMax()); + assertFalse(statistics.isSetMin_value()); + assertFalse(statistics.isSetMax_value()); + } + + @Test + public void testV2OnlyStats() { + testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""), + 0x7F, + 0x80); + testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""), + 0x7FFF, + 0x8000); + testV2OnlyStats(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""), + 0x7FFFFFFF, + 0x80000000); + testV2OnlyStats(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""), + 0x7FFFFFFFFFFFFFFFL, + 0x8000000000000000L); + testV2OnlyStats(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""), + new BigInteger("-765875"), + new BigInteger("876856")); + testV2OnlyStats( + Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7) + .named(""), + new BigInteger("-6769643"), + new BigInteger("9864675")); + } + + private void testV2OnlyStats(PrimitiveType type, Object min, Object max) { + Statistics stats = createStats(type, min, max); + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); + assertFalse(statistics.isSetMin()); + assertFalse(statistics.isSetMax()); + assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min_value); + assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max_value); + } + + @Test + public void testV2StatsEqualMinMax() { + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_8).named(""), + 93, + 93); + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_16).named(""), + -5892, + -5892); + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT32).as(OriginalType.UINT_32).named(""), + 234998934, + 234998934); + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT64).as(OriginalType.UINT_64).named(""), + -2389943895984985L, + -2389943895984985L); + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.DECIMAL).precision(6).named(""), + new BigInteger("823749"), + new BigInteger("823749")); + testV2StatsEqualMinMax( + Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(14).as(OriginalType.DECIMAL).precision(7) + .named(""), + new BigInteger("-8752832"), + new BigInteger("-8752832")); + testV2StatsEqualMinMax(Types.optional(PrimitiveTypeName.INT96).named(""), + new BigInteger("81032984"), + new BigInteger("81032984")); + } + + private void testV2StatsEqualMinMax(PrimitiveType type, Object min, Object max) { + Statistics stats = createStats(type, min, max); + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); + assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min); + assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max); + assertEquals(ByteBuffer.wrap(stats.getMinBytes()), statistics.min_value); + assertEquals(ByteBuffer.wrap(stats.getMaxBytes()), statistics.max_value); + } + + private static Statistics createStats(PrimitiveType type, T min, T max) { + Class c = min.getClass(); + if (c == Integer.class) { + return createStatsTyped(type, (Integer) min, (Integer) max); + } else if (c == Long.class) { + return createStatsTyped(type, (Long) 
min, (Long) max); + } else if (c == BigInteger.class) { + return createStatsTyped(type, (BigInteger) min, (BigInteger) max); + } + fail("Not implemented"); + return null; + } + + private static Statistics createStatsTyped(PrimitiveType type, int min, int max) { + Statistics stats = Statistics.createStats(type); + stats.updateStats(max); + stats.updateStats(min); + assertEquals(min, stats.genericGetMin()); + assertEquals(max, stats.genericGetMax()); + return stats; + } + + private static Statistics createStatsTyped(PrimitiveType type, long min, long max) { + Statistics stats = Statistics.createStats(type); + stats.updateStats(max); + stats.updateStats(min); + assertEquals(min, stats.genericGetMin()); + assertEquals(max, stats.genericGetMax()); + return stats; + } + + private static Statistics createStatsTyped(PrimitiveType type, BigInteger min, BigInteger max) { + Statistics stats = Statistics.createStats(type); + Binary minBinary = Binary.fromConstantByteArray(min.toByteArray()); + Binary maxBinary = Binary.fromConstantByteArray(max.toByteArray()); + stats.updateStats(maxBinary); + stats.updateStats(minBinary); + assertEquals(minBinary, stats.genericGetMin()); + assertEquals(maxBinary, stats.genericGetMax()); + return stats; + } + + private enum StatsHelper { + // Only min and max are filled (min_value and max_value are not) + V1() { + @Override + public org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats) { + org.apache.parquet.format.Statistics statistics = ParquetMetadataConverter.toParquetStatistics(stats); + statistics.unsetMin_value(); + statistics.unsetMax_value(); + return statistics; + } + }, + // min_value and max_value are filled (min and max might be filled as well) + V2() { + @Override + public org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats) { + return ParquetMetadataConverter.toParquetStatistics(stats); + } + }; + public abstract org.apache.parquet.format.Statistics toParquetStatistics(Statistics stats); + } + + @Test + public void testColumnOrders() throws IOException { + MessageType schema = parseMessageType("message test {" + + " optional binary binary_col;" // Normal column with type defined column order -> typeDefined + + " optional group map_col (MAP) {" + + " repeated group map (MAP_KEY_VALUE) {" + + " required binary key (UTF8);" // Key to be hacked to have unknown column order -> undefined + + " optional group list_col (LIST) {" + + " repeated group list {" + + " optional int96 array_element;" // INT96 element with type defined column order -> undefined + + " }" + + " }" + + " }" + + " }" + + "}"); + org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData( + schema, new HashMap(), null); + ParquetMetadata metadata = new ParquetMetadata(fileMetaData, new ArrayList()); + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + FileMetaData formatMetadata = converter.toParquetMetadata(1, metadata); + + List columnOrders = formatMetadata.getColumn_orders(); + assertEquals(3, columnOrders.size()); + for (org.apache.parquet.format.ColumnOrder columnOrder : columnOrders) { + assertTrue(columnOrder.isSetTYPE_ORDER()); + } + + // Simulate that thrift got a union type that is not in the generated code + // (when the file contains a not-yet-supported column order) + columnOrders.get(1).clear(); + + MessageType resultSchema = converter.fromParquetMetadata(formatMetadata).getFileMetaData().getSchema(); + List columns = resultSchema.getColumns(); + 
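+ // binary_col keeps its type-defined order; the cleared (not-yet-supported) key column and the
+ // INT96 array_element both fall back to ColumnOrder.undefined():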
assertEquals(3, columns.size());
+ assertEquals(ColumnOrder.typeDefined(), columns.get(0).getPrimitiveType().columnOrder());
+ assertEquals(ColumnOrder.undefined(), columns.get(1).getPrimitiveType().columnOrder());
+ assertEquals(ColumnOrder.undefined(), columns.get(2).getPrimitiveType().columnOrder());
}
}
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
index 6915c86ec3..4243e9bd18 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestParquetFileWriter.java
@@ -524,12 +524,12 @@ public void testWriteReadStatistics() throws Exception {
String str = new String(bsout.getMaxBytes());
String str2 = new String(bsout.getMinBytes());
- assertTrue(((BinaryStatistics)readFooter.getBlocks().get(0).getColumns().get(0).getStatistics()).equals(bs1));
- assertTrue(((LongStatistics)readFooter.getBlocks().get(0).getColumns().get(1).getStatistics()).equals(ls1));
+ TestUtils.assertStatsValuesEqual(bs1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics());
+ TestUtils.assertStatsValuesEqual(ls1, readFooter.getBlocks().get(0).getColumns().get(1).getStatistics());
}
{ // assert stats are correct for the second block
- assertTrue(((BinaryStatistics)readFooter.getBlocks().get(1).getColumns().get(0).getStatistics()).equals(bs2));
- assertTrue(((LongStatistics)readFooter.getBlocks().get(1).getColumns().get(1).getStatistics()).equals(ls2));
+ TestUtils.assertStatsValuesEqual(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics());
+ TestUtils.assertStatsValuesEqual(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics());
}
}
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestUtils.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestUtils.java
index e53ac785a0..59b4b62140 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestUtils.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestUtils.java
@@ -24,6 +24,8 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.parquet.column.statistics.Statistics;
+import org.hamcrest.CoreMatchers;
import org.junit.Assert;
public class TestUtils {
@@ -61,4 +63,23 @@ public static void assertThrows(
}
}
}
+
+ public static void assertStatsValuesEqual(Statistics stats1, Statistics stats2) {
+ assertStatsValuesEqual(null, stats1, stats2);
+ }
+
+ // To be used to assert that the values (min, max, num-of-nulls) are equal. It might be used in cases where
+ // creating a Statistics object for the proper Type would require too much work or code duplication.
+ public static void assertStatsValuesEqual(String message, Statistics expected, Statistics actual) { + if (expected == actual) { + return; + } + if (expected == null || actual == null) { + Assert.assertEquals(expected, actual); + } + Assert.assertThat(actual, CoreMatchers.instanceOf(expected.getClass())); + Assert.assertArrayEquals(message, expected.getMaxBytes(), actual.getMaxBytes()); + Assert.assertArrayEquals(message, expected.getMinBytes(), actual.getMinBytes()); + Assert.assertEquals(message, expected.getNumNulls(), actual.getNumNulls()); + } } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java index cbdd935f29..16db5cbf0d 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/RandomValues.java @@ -26,7 +26,7 @@ public class RandomValues { private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"; - private static abstract class RandomValueGenerator> { + static abstract class RandomValueGenerator> { private final Random random; protected RandomValueGenerator(long seed) { @@ -37,8 +37,8 @@ public boolean shouldGenerateNull() { return (random.nextInt(10) == 0); } - public int randomInt() { return randomInt(Integer.MAX_VALUE - 1); } - public int randomInt(int maximum) { + public int randomInt() { return random.nextInt(); } + public int randomPositiveInt(int maximum) { // Maximum may be a random number (which may be negative). return random.nextInt(Math.abs(maximum) + 1); } @@ -63,11 +63,11 @@ public BigInteger randomInt96(BigInteger maximum) { } public char randomLetter() { - return ALPHABET.charAt(randomInt() % ALPHABET.length()); + return ALPHABET.charAt(randomPositiveInt(ALPHABET.length() - 1)); } public String randomString(int maxLength) { - return randomFixedLengthString(randomInt(maxLength)); + return randomFixedLengthString(randomPositiveInt(maxLength)); } public String randomFixedLengthString(int length) { @@ -82,7 +82,7 @@ public String randomFixedLengthString(int length) { public abstract T nextValue(); } - private static abstract class RandomBinaryBase> extends RandomValueGenerator { + static abstract class RandomBinaryBase> extends RandomValueGenerator { protected final int bufferLength; protected final byte[] buffer; @@ -103,18 +103,56 @@ public Binary asReusedBinary(byte[] data) { } public static class IntGenerator extends RandomValueGenerator { - private final RandomRange randomRange = new RandomRange(randomInt(), randomInt()); - private final int minimum = randomRange.minimum(); - private final int maximum = randomRange.maximum(); - private final int range = (maximum - minimum); + private final int minimum; + private final int range; public IntGenerator(long seed) { super(seed); + RandomRange randomRange = new RandomRange<>(randomInt(), randomInt()); + this.minimum = randomRange.minimum(); + this.range = (randomRange.maximum() - this.minimum); + } + + public IntGenerator(long seed, int minimum, int maximum) { + super(seed); + RandomRange randomRange = new RandomRange<>(minimum, maximum); + this.minimum = randomRange.minimum(); + this.range = randomRange.maximum() - this.minimum; } @Override public Integer nextValue() { - return (minimum + randomInt(range)); + return (minimum + randomPositiveInt(range)); + } + } + + public static class UIntGenerator extends IntGenerator { + private final int mask; + + public 
UIntGenerator(long seed, byte minimum, byte maximum) { + super(seed, minimum, maximum); + mask = 0xFF; + } + + public UIntGenerator(long seed, short minimum, short maximum) { + super(seed, minimum, maximum); + mask = 0xFFFF; + } + + @Override + public Integer nextValue() { + return super.nextValue() & mask; + } + } + + public static class UnconstrainedIntGenerator extends RandomValueGenerator { + public UnconstrainedIntGenerator(long seed) { + super(seed); + } + + @Override + public Integer nextValue() { + return randomInt(); } } @@ -134,6 +172,17 @@ public Long nextValue() { } } + public static class UnconstrainedLongGenerator extends RandomValueGenerator { + public UnconstrainedLongGenerator(long seed) { + super(seed); + } + + @Override + public Long nextValue() { + return randomLong(); + } + } + public static class Int96Generator extends RandomBinaryBase { private final RandomRange randomRange = new RandomRange(randomInt96(), randomInt96()); private final BigInteger minimum = randomRange.minimum(); @@ -173,6 +222,17 @@ public Float nextValue() { } } + public static class UnconstrainedFloatGenerator extends RandomValueGenerator { + public UnconstrainedFloatGenerator(long seed) { + super(seed); + } + + @Override + public Float nextValue() { + return randomFloat(); + } + } + public static class DoubleGenerator extends RandomValueGenerator { private final RandomRange randomRange = new RandomRange(randomDouble(), randomDouble()); private final double minimum = randomRange.minimum(); @@ -189,6 +249,17 @@ public Double nextValue() { } } + public static class UnconstrainedDoubleGenerator extends RandomValueGenerator { + public UnconstrainedDoubleGenerator(long seed) { + super(seed); + } + + @Override + public Double nextValue() { + return randomDouble(); + } + } + public static class StringGenerator extends RandomBinaryBase { private static final int MAX_STRING_LENGTH = 16; public StringGenerator(long seed) { @@ -197,7 +268,7 @@ public StringGenerator(long seed) { @Override public String nextValue() { - int stringLength = randomInt(15) + 1; + int stringLength = randomPositiveInt(15) + 1; return randomString(stringLength); } @@ -216,7 +287,7 @@ public BinaryGenerator(long seed) { @Override public Binary nextValue() { // use a random length, but ensure it is at least a few bytes - int length = 5 + randomInt(buffer.length - 5); + int length = 5 + randomPositiveInt(buffer.length - 5); for (int index = 0; index < length; index++) { buffer[index] = (byte) randomInt(); } diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java index d157cc3719..5a5d6d4f25 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/statistics/TestStatistics.java @@ -42,8 +42,13 @@ import org.apache.parquet.io.api.Binary; import org.apache.parquet.io.api.PrimitiveConverter; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; +import org.apache.parquet.statistics.RandomValues.RandomBinaryBase; +import org.apache.parquet.statistics.RandomValues.RandomValueGenerator; import org.junit.Assert; import org.junit.Rule; import org.junit.Test; @@ -51,7 +56,9 @@ import java.io.File; import 
java.io.IOException; +import java.math.BigInteger; import java.util.Arrays; +import java.util.Comparator; import java.util.List; import java.util.Random; @@ -59,6 +66,7 @@ import static org.apache.parquet.schema.Type.Repetition.OPTIONAL; import static org.apache.parquet.schema.Type.Repetition.REQUIRED; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertEquals; public class TestStatistics { private static final int MEGABYTE = 1 << 20; @@ -163,9 +171,11 @@ private static class StatsValidator> { private final boolean hasNonNull; private final T min; private final T max; + private final Comparator comparator; public StatsValidator(DataPage page) { Statistics stats = getStatisticsFromPageHeader(page); + this.comparator = stats.comparator(); this.hasNonNull = stats.hasNonNullValue(); if (hasNonNull) { this.min = stats.genericGetMin(); @@ -178,8 +188,8 @@ public StatsValidator(DataPage page) { public void validate(T value) { if (hasNonNull) { - assertTrue("min should be <= all values", min.compareTo(value) <= 0); - assertTrue("min should be >= all values", max.compareTo(value) >= 0); + assertTrue("min should be <= all values", comparator.compare(min, value) <= 0); + assertTrue("min should be >= all values", comparator.compare(max, value) >= 0); } } } @@ -280,7 +290,11 @@ public void validate(MessageType schema, PageReadStore store) { private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDescriptor desc) { SingletonPageReader reader = new SingletonPageReader(dict, page); PrimitiveConverter converter = getValidatingConverter(page, desc.getType()); - Statistics stats = getStatisticsFromPageHeader(page); + Statistics stats = getStatisticsFromPageHeader(page); + + assertEquals("Statistics does not use the proper comparator", + desc.getPrimitiveType().comparator().getClass(), + stats.comparator().getClass()); if (stats.isEmpty()) { // stats are empty if num nulls = 0 and there are no non-null values @@ -306,8 +320,8 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc System.err.println(String.format( "Validated stats min=%s max=%s nulls=%d for page=%s col=%s", - String.valueOf(stats.genericGetMin()), - String.valueOf(stats.genericGetMax()), stats.getNumNulls(), page, + stats.minAsString(), + stats.maxAsString(), stats.getNumNulls(), page, Arrays.toString(desc.getPath()))); } } @@ -315,92 +329,144 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc public static class DataContext extends DataGenerationContext.WriteContext { private static final int MAX_TOTAL_ROWS = 1000000; - private final long seed; private final Random random; private final int recordCount; - private final int fixedLength; - private final RandomValues.IntGenerator intGenerator; - private final RandomValues.LongGenerator longGenerator; - private final RandomValues.Int96Generator int96Generator; - private final RandomValues.FloatGenerator floatGenerator; - private final RandomValues.DoubleGenerator doubleGenerator; - private final RandomValues.StringGenerator stringGenerator; - private final RandomValues.BinaryGenerator binaryGenerator; - private final RandomValues.FixedGenerator fixedBinaryGenerator; + private final List> randomGenerators; public DataContext(long seed, File path, int blockSize, int pageSize, boolean enableDictionary, ParquetProperties.WriterVersion version) throws IOException { super(path, buildSchema(seed), blockSize, pageSize, enableDictionary, true, version); - this.seed = seed; this.random = new 
Random(seed); this.recordCount = random.nextInt(MAX_TOTAL_ROWS); - this.fixedLength = schema.getType("fixed-binary").asPrimitiveType().getTypeLength(); - this.intGenerator = new RandomValues.IntGenerator(random.nextLong()); - this.longGenerator = new RandomValues.LongGenerator(random.nextLong()); - this.int96Generator = new RandomValues.Int96Generator(random.nextLong()); - this.floatGenerator = new RandomValues.FloatGenerator(random.nextLong()); - this.doubleGenerator = new RandomValues.DoubleGenerator(random.nextLong()); - this.stringGenerator = new RandomValues.StringGenerator(random.nextLong()); - this.binaryGenerator = new RandomValues.BinaryGenerator(random.nextLong()); - this.fixedBinaryGenerator = new RandomValues.FixedGenerator(random.nextLong(), fixedLength); + int fixedLength = schema.getType("fixed-binary").asPrimitiveType().getTypeLength(); + + randomGenerators = Arrays.>asList( + new RandomValues.IntGenerator(random.nextLong()), + new RandomValues.LongGenerator(random.nextLong()), + new RandomValues.Int96Generator(random.nextLong()), + new RandomValues.FloatGenerator(random.nextLong()), + new RandomValues.DoubleGenerator(random.nextLong()), + new RandomValues.StringGenerator(random.nextLong()), + new RandomValues.BinaryGenerator(random.nextLong()), + new RandomValues.FixedGenerator(random.nextLong(), fixedLength), + new RandomValues.UnconstrainedIntGenerator(random.nextLong()), + new RandomValues.UnconstrainedLongGenerator(random.nextLong()), + new RandomValues.UnconstrainedFloatGenerator(random.nextLong()), + new RandomValues.UnconstrainedDoubleGenerator(random.nextLong()), + new RandomValues.IntGenerator(random.nextLong(), Byte.MIN_VALUE, Byte.MAX_VALUE), + new RandomValues.UIntGenerator(random.nextLong(), Byte.MIN_VALUE, Byte.MAX_VALUE), + new RandomValues.IntGenerator(random.nextLong(), Short.MIN_VALUE, Short.MAX_VALUE), + new RandomValues.UIntGenerator(random.nextLong(), Short.MIN_VALUE, Short.MAX_VALUE), + new RandomValues.UnconstrainedIntGenerator(random.nextLong()), + new RandomValues.UnconstrainedIntGenerator(random.nextLong()), + new RandomValues.UnconstrainedLongGenerator(random.nextLong()), + new RandomValues.UnconstrainedLongGenerator(random.nextLong()), + new RandomValues.UnconstrainedIntGenerator(random.nextLong()), + new RandomValues.UnconstrainedLongGenerator(random.nextLong()), + new RandomValues.FixedGenerator(random.nextLong(), fixedLength), + new RandomValues.BinaryGenerator(random.nextLong()), + new RandomValues.StringGenerator(random.nextLong()), + new RandomValues.StringGenerator(random.nextLong()), + new RandomValues.StringGenerator(random.nextLong()), + new RandomValues.BinaryGenerator(random.nextLong()), + new RandomValues.IntGenerator(random.nextLong()), + new RandomValues.IntGenerator(random.nextLong()), + new RandomValues.LongGenerator(random.nextLong()), + new RandomValues.LongGenerator(random.nextLong()), + new RandomValues.LongGenerator(random.nextLong()), + new RandomValues.FixedGenerator(random.nextLong(), 12) + ); } private static MessageType buildSchema(long seed) { Random random = new Random(seed); int fixedBinaryLength = random.nextInt(21) + 1; + int fixedPrecision = calculatePrecision(fixedBinaryLength); + int fixedScale = fixedPrecision / 4; + int binaryPrecision = calculatePrecision(16); + int binaryScale = binaryPrecision / 4; return new MessageType("schema", - new PrimitiveType(OPTIONAL, INT32, "i32"), - new PrimitiveType(OPTIONAL, INT64, "i64"), - new PrimitiveType(OPTIONAL, INT96, "i96"), - new PrimitiveType(OPTIONAL, FLOAT, 
"sngl"), - new PrimitiveType(OPTIONAL, DOUBLE, "dbl"), - new PrimitiveType(OPTIONAL, BINARY, "strings"), - new PrimitiveType(OPTIONAL, BINARY, "binary"), - new PrimitiveType(OPTIONAL, FIXED_LEN_BYTE_ARRAY, fixedBinaryLength, "fixed-binary"), - new PrimitiveType(REQUIRED, INT32, "unconstrained-i32"), - new PrimitiveType(REQUIRED, INT64, "unconstrained-i64"), - new PrimitiveType(REQUIRED, FLOAT, "unconstrained-sngl"), - new PrimitiveType(REQUIRED, DOUBLE, "unconstrained-dbl") + new PrimitiveType(OPTIONAL, INT32, "i32"), + new PrimitiveType(OPTIONAL, INT64, "i64"), + new PrimitiveType(OPTIONAL, INT96, "i96"), + new PrimitiveType(OPTIONAL, FLOAT, "sngl"), + new PrimitiveType(OPTIONAL, DOUBLE, "dbl"), + new PrimitiveType(OPTIONAL, BINARY, "strings"), + new PrimitiveType(OPTIONAL, BINARY, "binary"), + new PrimitiveType(OPTIONAL, FIXED_LEN_BYTE_ARRAY, fixedBinaryLength, "fixed-binary"), + new PrimitiveType(REQUIRED, INT32, "unconstrained-i32"), + new PrimitiveType(REQUIRED, INT64, "unconstrained-i64"), + new PrimitiveType(REQUIRED, FLOAT, "unconstrained-sngl"), + new PrimitiveType(REQUIRED, DOUBLE, "unconstrained-dbl"), + Types.optional(INT32).as(OriginalType.INT_8).named("int8"), + Types.optional(INT32).as(OriginalType.UINT_8).named("uint8"), + Types.optional(INT32).as(OriginalType.INT_16).named("int16"), + Types.optional(INT32).as(OriginalType.UINT_16).named("uint16"), + Types.optional(INT32).as(OriginalType.INT_32).named("int32"), + Types.optional(INT32).as(OriginalType.UINT_32).named("uint32"), + Types.optional(INT64).as(OriginalType.INT_64).named("int64"), + Types.optional(INT64).as(OriginalType.UINT_64).named("uint64"), + Types.optional(INT32).as(OriginalType.DECIMAL).precision(9).scale(2).named("decimal-int32"), + Types.optional(INT64).as(OriginalType.DECIMAL).precision(18).scale(4).named("decimal-int64"), + Types.optional(FIXED_LEN_BYTE_ARRAY).length(fixedBinaryLength).as(OriginalType.DECIMAL) + .precision(fixedPrecision).scale(fixedScale).named("decimal-fixed"), + Types.optional(BINARY).as(OriginalType.DECIMAL).precision(binaryPrecision).scale(binaryScale) + .named("decimal-binary"), + Types.optional(BINARY).as(OriginalType.UTF8).named("utf8"), + Types.optional(BINARY).as(OriginalType.ENUM).named("enum"), + Types.optional(BINARY).as(OriginalType.JSON).named("json"), + Types.optional(BINARY).as(OriginalType.BSON).named("bson"), + Types.optional(INT32).as(OriginalType.DATE).named("date"), + Types.optional(INT32).as(OriginalType.TIME_MILLIS).named("time-millis"), + Types.optional(INT64).as(OriginalType.TIME_MICROS).named("time-micros"), + Types.optional(INT64).as(OriginalType.TIMESTAMP_MILLIS).named("timestamp-millis"), + Types.optional(INT64).as(OriginalType.TIMESTAMP_MICROS).named("timestamp-micros"), + Types.optional(FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named("interval") ); } + private static int calculatePrecision(int byteCnt) { + String maxValue = BigInteger.valueOf(2L).pow(8 * byteCnt - 1).toString(); + return maxValue.length() - 1; + } + @Override public void write(ParquetWriter writer) throws IOException { for (int index = 0; index < recordCount; index++) { Group group = new SimpleGroup(super.schema); - if (!intGenerator.shouldGenerateNull()) { - group.append("i32", intGenerator.nextValue()); - } - if (!longGenerator.shouldGenerateNull()) { - group.append("i64", longGenerator.nextValue()); - } - if (!int96Generator.shouldGenerateNull()) { - group.append("i96", int96Generator.nextBinaryValue()); - } - if (!floatGenerator.shouldGenerateNull()) { - 
group.append("sngl", floatGenerator.nextValue()); - } - if (!doubleGenerator.shouldGenerateNull()) { - group.append("dbl", doubleGenerator.nextValue()); - } - if (!stringGenerator.shouldGenerateNull()) { - group.append("strings", stringGenerator.nextBinaryValue()); - } - if (!binaryGenerator.shouldGenerateNull()) { - group.append("binary", binaryGenerator.nextBinaryValue()); - } - if (!fixedBinaryGenerator.shouldGenerateNull()) { - group.append("fixed-binary", fixedBinaryGenerator.nextBinaryValue()); + for (int column = 0, columnCnt = schema.getFieldCount(); column < columnCnt; ++column) { + Type type = schema.getType(column); + RandomValueGenerator generator = randomGenerators.get(column); + if (type.isRepetition(OPTIONAL) && generator.shouldGenerateNull()) { + continue; + } + switch (type.asPrimitiveType().getPrimitiveTypeName()) { + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + case INT96: + group.append(type.getName(), ((RandomBinaryBase) generator).nextBinaryValue()); + break; + case INT32: + group.append(type.getName(), (Integer) generator.nextValue()); + break; + case INT64: + group.append(type.getName(), (Long) generator.nextValue()); + break; + case FLOAT: + group.append(type.getName(), (Float) generator.nextValue()); + break; + case DOUBLE: + group.append(type.getName(), (Double) generator.nextValue()); + break; + case BOOLEAN: + group.append(type.getName(), (Boolean) generator.nextValue()); + break; + } } - group.append("unconstrained-i32", random.nextInt()); - group.append("unconstrained-i64", random.nextLong()); - group.append("unconstrained-sngl", random.nextFloat()); - group.append("unconstrained-dbl", random.nextDouble()); - writer.write(group); } } diff --git a/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestThriftToParquetFileWriter.java b/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestThriftToParquetFileWriter.java index 0439686dce..66b804ccb8 100644 --- a/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestThriftToParquetFileWriter.java +++ b/parquet-thrift/src/test/java/org/apache/parquet/hadoop/thrift/TestThriftToParquetFileWriter.java @@ -19,8 +19,6 @@ package org.apache.parquet.hadoop.thrift; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Arrays; @@ -53,6 +51,7 @@ import org.apache.parquet.example.data.Group; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.TestUtils; import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.metadata.ParquetMetadata; @@ -122,21 +121,21 @@ public void testWriteStatistics() throws Exception { for(ColumnChunkMetaData cmd: bmd.getColumns()) { switch(cmd.getType()) { case INT32: - assertTrue(intStatsSmall.equals((IntStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(intStatsSmall, cmd.getStatistics()); break; case INT64: - assertTrue(longStatsSmall.equals((LongStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(longStatsSmall, cmd.getStatistics()); break; case DOUBLE: - assertTrue(doubleStatsSmall.equals((DoubleStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(doubleStatsSmall, cmd.getStatistics()); break; case BOOLEAN: - assertTrue(boolStats.equals((BooleanStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(boolStats, cmd.getStatistics()); break; case BINARY: // there is also 
info_string that has no statistics if(cmd.getPath().toString() == "[test_string]") - assertTrue(binaryStatsSmall.equals((BinaryStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(binaryStatsSmall, cmd.getStatistics()); break; } } @@ -171,21 +170,21 @@ public void testWriteStatistics() throws Exception { case INT32: // testing the correct limits of an int32, there are also byte and short, tested earlier if(cmd.getPath().toString() == "[test_i32]") - assertTrue(intStatsLarge.equals((IntStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(intStatsLarge, cmd.getStatistics()); break; case INT64: - assertTrue(longStatsLarge.equals((LongStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(longStatsLarge, cmd.getStatistics()); break; case DOUBLE: - assertTrue(doubleStatsLarge.equals((DoubleStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(doubleStatsLarge, cmd.getStatistics()); break; case BOOLEAN: - assertTrue(boolStats.equals((BooleanStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(boolStats, cmd.getStatistics()); break; case BINARY: // there is also info_string that has no statistics if(cmd.getPath().toString() == "[test_string]") - assertTrue(binaryStatsLarge.equals((BinaryStatistics)cmd.getStatistics())); + TestUtils.assertStatsValuesEqual(binaryStatsLarge, cmd.getStatistics()); break; } }
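As a closing illustration of why these tests now compare statistics by value and type: statistics created from a full PrimitiveType use the type-defined ordering instead of the signed, physical-type ordering. A small sketch assuming the behavior asserted by testV2OnlyStats for UINT_32 above; the class name and column name are illustrative:

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class UnsignedStatsSketch {
  public static void main(String[] args) {
    // An INT32 column annotated as UINT_32: physically an int, logically unsigned.
    PrimitiveType uint32 = Types.optional(PrimitiveTypeName.INT32)
        .as(OriginalType.UINT_32)
        .named("uint32_col");

    // createStats(type) picks both the statistics implementation and the type-defined comparator.
    Statistics<?> stats = Statistics.createStats(uint32);
    stats.updateStats(1);
    stats.updateStats(0x80000000); // negative as a signed int, largest seen here as unsigned

    // With unsigned ordering, 1 is the min and 0x80000000 is the max,
    // mirroring what testV2OnlyStats asserts for UINT_32.
    System.out.println("min = " + stats.genericGetMin()); // expected: 1
    System.out.println("max = " + stats.genericGetMax()); // expected: -2147483648 (0x80000000)
  }
}

With the old physical-type-only statistics, the same two updates would have been compared as signed INT32 values and reported 0x80000000 as the minimum.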