Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 4 additions & 46 deletions parquet-cli/src/main/java/org/apache/parquet/cli/Util.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,13 @@
import org.apache.parquet.column.EncodingStats;
import org.apache.parquet.column.statistics.BinaryStatistics;
import org.apache.parquet.column.statistics.BooleanStatistics;
import org.apache.parquet.column.statistics.DoubleStatistics;
import org.apache.parquet.column.statistics.FloatStatistics;
import org.apache.parquet.column.statistics.IntStatistics;
import org.apache.parquet.column.statistics.LongStatistics;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.Set;

import static org.apache.parquet.column.Encoding.BIT_PACKED;
Expand Down Expand Up @@ -96,34 +91,14 @@ public static String minMaxAsString(Statistics stats, OriginalType annotation) {
return "";
}
// TODO: use original types when showing decimal, timestamp, etc.
if (stats instanceof BooleanStatistics) {
return String.format("%s / %s",
((BooleanStatistics) stats).getMin(),
((BooleanStatistics) stats).getMax());
} else if (stats instanceof IntStatistics) {
return String.format("%d / %d",
((IntStatistics) stats).getMin(),
((IntStatistics) stats).getMax());
} else if (stats instanceof LongStatistics) {
return String.format("%d / %d",
((LongStatistics) stats).getMin(),
((LongStatistics) stats).getMax());
} else if (stats instanceof FloatStatistics) {
return String.format("%f / %f",
((FloatStatistics) stats).getMin(),
((FloatStatistics) stats).getMax());
} else if (stats instanceof DoubleStatistics) {
return String.format("%f / %f",
((DoubleStatistics) stats).getMin(),
((DoubleStatistics) stats).getMax());
} else if (stats instanceof BinaryStatistics) {
if (stats instanceof BinaryStatistics) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is where the other stats types have been removed. Why are only binary statistics supported here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I get it. I had started with a different concept and forgot to revert this change. I'll do it in the next commit.
Thanks for catching this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having a single Statistics class would be nice, if it works out when the comparator interfaces are done.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've designed the new PrimitiveComparator superclass to provide comparison methods for primitive values, which avoids the unnecessary boxing of primitive types. So I would keep the current specialised Statistics classes for performance reasons.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

byte[] minBytes = stats.getMinBytes();
byte[] maxBytes = stats.getMaxBytes();
return String.format("%s / %s",
printable(minBytes, annotation == OriginalType.UTF8, 30),
printable(maxBytes, annotation == OriginalType.UTF8, 30));
} else {
throw new RuntimeException("Unknown stats type: " + stats);
return String.format("%s / %s", stats.minAsString(), stats.maxAsString());
}
}

Expand All @@ -134,24 +109,6 @@ public static String toString(Statistics stats, long count, OriginalType annotat
// TODO: use original types when showing decimal, timestamp, etc.
if (stats instanceof BooleanStatistics) {
return String.format("nulls: %d/%d", stats.getNumNulls(), count);
} else if (stats instanceof IntStatistics) {
return String.format("min: %d max: %d nulls: %d/%d",
((IntStatistics) stats).getMin(), ((IntStatistics) stats).getMax(),
stats.getNumNulls(), count);
} else if (stats instanceof LongStatistics) {
return String.format("min: %d max: %d nulls: %d/%d",
((LongStatistics) stats).getMin(), ((LongStatistics) stats).getMax(),
stats.getNumNulls(), count);
} else if (stats instanceof FloatStatistics) {
return String.format("min: %f max: %f nulls: %d/%d",
((FloatStatistics) stats).getMin(),
((FloatStatistics) stats).getMax(),
stats.getNumNulls(), count);
} else if (stats instanceof DoubleStatistics) {
return String.format("min: %f max: %f nulls: %d/%d",
((DoubleStatistics) stats).getMin(),
((DoubleStatistics) stats).getMax(),
stats.getNumNulls(), count);
} else if (stats instanceof BinaryStatistics) {
byte[] minBytes = stats.getMinBytes();
byte[] maxBytes = stats.getMaxBytes();
Expand All @@ -160,7 +117,8 @@ public static String toString(Statistics stats, long count, OriginalType annotat
printable(maxBytes, annotation == OriginalType.UTF8, 30),
stats.getNumNulls(), count);
} else {
throw new RuntimeException("Unknown stats type: " + stats);
return String.format("min: %s max: %s nulls: %d/%d",
stats.minAsString(), stats.maxAsString(), stats.getNumNulls(), count);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
Expand Down Expand Up @@ -184,9 +185,11 @@ private class StatsValidator<T extends Comparable<T>> {
private final boolean hasNonNull;
private final T min;
private final T max;
private final Comparator<T> comparator;

public StatsValidator(DataPage page) {
Statistics<T> stats = getStatisticsFromPageHeader(page);
this.comparator = stats.comparator();
this.hasNonNull = stats.hasNonNullValue();
if (hasNonNull) {
this.min = stats.genericGetMin();
Expand All @@ -199,10 +202,10 @@ public StatsValidator(DataPage page) {

public void validate(T value) {
if (hasNonNull) {
if (min.compareTo(value) > 0) {
if (comparator.compare(min, value) > 0) {
throw new BadStatsException("Min should be <= all values.");
}
if (max.compareTo(value) < 0) {
if (comparator.compare(max, value) < 0) {
throw new BadStatsException("Max should be >= all values.");
}
}
Expand Down Expand Up @@ -343,8 +346,8 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict,

console.debug(String.format(
"Validated stats min=%s max=%s nulls=%d for page=%s col=%s",
String.valueOf(stats.genericGetMin()),
String.valueOf(stats.genericGetMax()), stats.getNumNulls(), page,
stats.minAsString(),
stats.maxAsString(), stats.getNumNulls(), page,
Arrays.toString(desc.getPath())));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@

import java.util.Arrays;

import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;

/**
* Describes a column's type as well as its position in its containing schema.
Expand All @@ -31,8 +33,7 @@
public class ColumnDescriptor implements Comparable<ColumnDescriptor> {

private final String[] path;
private final PrimitiveTypeName type;
private final int typeLength;
private final PrimitiveType type;
private final int maxRep;
private final int maxDef;

Expand All @@ -42,8 +43,10 @@ public class ColumnDescriptor implements Comparable<ColumnDescriptor> {
* @param type the type of the field
* @param maxRep the maximum repetition level for that path
* @param maxDef the maximum definition level for that path
* @deprecated Use {@link #ColumnDescriptor(String[], PrimitiveType, int, int)}
*/
public ColumnDescriptor(String[] path, PrimitiveTypeName type, int maxRep,
@Deprecated
public ColumnDescriptor(String[] path, PrimitiveTypeName type, int maxRep,
int maxDef) {
this(path, type, 0, maxRep, maxDef);
}
Expand All @@ -54,13 +57,23 @@ public ColumnDescriptor(String[] path, PrimitiveTypeName type, int maxRep,
* @param type the type of the field
* @param maxRep the maximum repetition level for that path
* @param maxDef the maximum definition level for that path
* @deprecated Use {@link #ColumnDescriptor(String[], PrimitiveType, int, int)}
*/
public ColumnDescriptor(String[] path, PrimitiveTypeName type,
@Deprecated
public ColumnDescriptor(String[] path, PrimitiveTypeName type,
int typeLength, int maxRep, int maxDef) {
super();
this(path, new PrimitiveType(Type.Repetition.OPTIONAL, type, typeLength,""), maxRep, maxDef);
}

/**
* @param path the path to the leaf field in the schema
* @param type the type of the field
* @param maxRep the maximum repetition level for that path
* @param maxDef the maximum definition level for that path
*/
public ColumnDescriptor(String[] path, PrimitiveType type, int maxRep, int maxDef) {
this.path = path;
this.type = type;
this.typeLength = typeLength;
this.maxRep = maxRep;
this.maxDef = maxDef;
}
Expand Down Expand Up @@ -88,16 +101,27 @@ public int getMaxDefinitionLevel() {

/**
* @return the type of that column
* @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead.
*/
@Deprecated
public PrimitiveTypeName getType() {
return type;
return type.getPrimitiveTypeName();
}

/**
* @return the size of the type
* @deprecated will be removed in 2.0.0. Use {@link #getPrimitiveType()} instead.
**/
@Deprecated
public int getTypeLength() {
return typeLength;
return type.getTypeLength();
}

/**
* Returns the {@link PrimitiveType} object held by this descriptor for the column.
*
* @return the primitive type object of the column
*/
public PrimitiveType getPrimitiveType() {
return type;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ private void log(Object value, int r, int d) {
}

private void resetStatistics() {
this.statistics = Statistics.getStatsBasedOnType(this.path.getType());
this.statistics = Statistics.createStats(this.path.getPrimitiveType());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ private void log(Object value, int r, int d) {
}

private void resetStatistics() {
this.statistics = Statistics.getStatsBasedOnType(this.path.getType());
this.statistics = Statistics.createStats(path.getPrimitiveType());
}

private void definitionLevel(int definitionLevel) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,38 @@
package org.apache.parquet.column.statistics;

import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

public class BinaryStatistics extends Statistics<Binary> {

// A fake type object to be used to generate the proper comparator
private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY)
.named("fake_binary_type");

private Binary max;
private Binary min;

/**
* @deprecated will be removed in 2.0.0. Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead
*/
@Deprecated
public BinaryStatistics() {
this(DEFAULT_FAKE_TYPE);
}

/**
* Creates binary statistics for the given primitive type by passing it on to
* the superclass constructor.
*
* @param type the primitive type these statistics are created for
*/
BinaryStatistics(PrimitiveType type) {
super(type);
}

/**
* Copy constructor: creates statistics with the same type as {@code other}
* and carries over its min/max (when present) and its null count.
*
* @param other the statistics to take the type, min/max and null count from
*/
private BinaryStatistics(BinaryStatistics other) {
super(other.type());
// Only carry over min/max when the source actually has a non-null value.
if (other.hasNonNullValue()) {
// The min/max Binary references are passed as-is, without calling Binary.copy() here.
// NOTE(review): whether initializeStats copies them is not visible in this view - confirm.
initializeStats(other.min, other.max);
}
setNumNulls(other.getNumNulls());
}

@Override
public void updateStats(Binary value) {
if (!this.hasNonNullValue()) {
Expand Down Expand Up @@ -68,27 +94,23 @@ public byte[] getMinBytes() {
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || ((min.length() + max.length()) < size);
String toString(Binary value) {
// TODO: have separate toString for different logical types?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this should be based on the full type and not just assume the value can be converted to string with UTF-8.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be better to implement this in a separate issue, because there are some open questions and it might turn into a bigger effort. (E.g., do we want to implement it for statistics only, or for Binary as well? Should it be retrieved from the type, like the comparators, or implemented in toString? What about unsigned integers?)
What do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. Please open a follow-up issue, then.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Created PARQUET-1170.

return value == null ? "null" : value.toStringUsingUTF8();
}

@Override
public String toString() {
if (this.hasNonNullValue())
return String.format("min: %s, max: %s, num_nulls: %d", min.toStringUsingUTF8(), max.toStringUsingUTF8(), this.getNumNulls());
else if (!this.isEmpty())
return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
else
return "no stats for this column";
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || ((min.length() + max.length()) < size);
}

/**
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
*/
@Deprecated
public void updateStats(Binary min_value, Binary max_value) {
if (min.compareTo(min_value) > 0) { min = min_value.copy(); }
if (max.compareTo(max_value) < 0) { max = max_value.copy(); }
if (comparator().compare(min, min_value) > 0) { min = min_value.copy(); }
if (comparator().compare(max, max_value) < 0) { max = max_value.copy(); }
}

/**
Expand Down Expand Up @@ -136,4 +158,9 @@ public void setMinMax(Binary min, Binary max) {
this.min = min;
this.markAsNotEmpty();
}

@Override
public BinaryStatistics copy() {
return new BinaryStatistics(this);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docs for this method state that "all the values are copied" but the copy constructor doesn't call Binary.copy(). I don't think it is necessary for it to copy, so you should remove the part about copying values from the docs.

}
}
Loading