-
Notifications
You must be signed in to change notification settings - Fork 1.5k
PARQUET-1025: Support new min-max statistics in parquet-mr #435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e1719bb
3378b6d
52cd58f
20b937f
51bc1f8
688ef2e
318e585
c5536a0
2f28c2c
95199e5
70e56a7
bc86e8a
a2ae97c
524750b
dc838f2
820df6f
2a63fcf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,12 +19,38 @@ | |
| package org.apache.parquet.column.statistics; | ||
|
|
||
| import org.apache.parquet.io.api.Binary; | ||
| import org.apache.parquet.schema.PrimitiveType; | ||
| import org.apache.parquet.schema.Types; | ||
|
|
||
| public class BinaryStatistics extends Statistics<Binary> { | ||
|
|
||
| // A fake type object to be used to generate the proper comparator | ||
| private static final PrimitiveType DEFAULT_FAKE_TYPE = Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) | ||
| .named("fake_binary_type"); | ||
|
|
||
| private Binary max; | ||
| private Binary min; | ||
|
|
||
| /** | ||
| * @deprecated will be removed in 2.0.0. Use {@link Statistics#createStats(org.apache.parquet.schema.Type)} instead | ||
| */ | ||
| @Deprecated | ||
| public BinaryStatistics() { | ||
| this(DEFAULT_FAKE_TYPE); | ||
| } | ||
|
|
||
| BinaryStatistics(PrimitiveType type) { | ||
| super(type); | ||
| } | ||
|
|
||
| private BinaryStatistics(BinaryStatistics other) { | ||
| super(other.type()); | ||
| if (other.hasNonNullValue()) { | ||
| initializeStats(other.min, other.max); | ||
| } | ||
| setNumNulls(other.getNumNulls()); | ||
| } | ||
|
|
||
| @Override | ||
| public void updateStats(Binary value) { | ||
| if (!this.hasNonNullValue()) { | ||
|
|
@@ -68,27 +94,23 @@ public byte[] getMinBytes() { | |
| } | ||
|
|
||
| @Override | ||
| public boolean isSmallerThan(long size) { | ||
| return !hasNonNullValue() || ((min.length() + max.length()) < size); | ||
| String toString(Binary value) { | ||
| // TODO: have separate toString for different logical types? | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, this should be based on the full type and not just assume the value can be converted to string with UTF-8.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think, it would be better to be implemented in a separate issue because there are some open questions and might lead to bigger effort. (E.g. do we want to implement it for statistics only or for Binary? Should it be similar to the comparators to be retrieved from type or implement in the toString? What about the unsigned integers?)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good. Please open a follow-up issue, then.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Created PARQUET-1170. |
||
| return value == null ? "null" : value.toStringUsingUTF8(); | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| if (this.hasNonNullValue()) | ||
| return String.format("min: %s, max: %s, num_nulls: %d", min.toStringUsingUTF8(), max.toStringUsingUTF8(), this.getNumNulls()); | ||
| else if (!this.isEmpty()) | ||
| return String.format("num_nulls: %d, min/max not defined", this.getNumNulls()); | ||
| else | ||
| return "no stats for this column"; | ||
| public boolean isSmallerThan(long size) { | ||
| return !hasNonNullValue() || ((min.length() + max.length()) < size); | ||
| } | ||
|
|
||
| /** | ||
| * @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0 | ||
| */ | ||
| @Deprecated | ||
| public void updateStats(Binary min_value, Binary max_value) { | ||
| if (min.compareTo(min_value) > 0) { min = min_value.copy(); } | ||
| if (max.compareTo(max_value) < 0) { max = max_value.copy(); } | ||
| if (comparator().compare(min, min_value) > 0) { min = min_value.copy(); } | ||
| if (comparator().compare(max, max_value) < 0) { max = max_value.copy(); } | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -136,4 +158,9 @@ public void setMinMax(Binary min, Binary max) { | |
| this.min = min; | ||
| this.markAsNotEmpty(); | ||
| } | ||
|
|
||
| @Override | ||
| public BinaryStatistics copy() { | ||
| return new BinaryStatistics(this); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The docs for this method state that "all the values are copied" but the copy constructor doesn't call |
||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is where the other stats types have been removed. Why are only binary statistics supported here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now, I got it. There was another concept I've started with and I've forgot to revert this change. I'll do it in the next commit.
Thanks for the finding.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Having a single Statistics class would be nice, if it works out when the comparator interfaces are done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've designed the new
PrimitiveComparatorsuper class to have primitive comparisons to avoid the unnecessary boxing of primitive types. So, I would keep the current specialisedStatisticsclasses for performance reasons.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1