Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ public byte[] getMinBytes() {
return min == null ? null : min.getBytes();
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || ((min.length() + max.length()) < size);
}

@Override
public String toString() {
if (this.hasNonNullValue())
Expand All @@ -77,11 +82,19 @@ else if (!this.isEmpty())
return "no stats for this column";
}

/**
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
*/
@Deprecated
public void updateStats(Binary min_value, Binary max_value) {
if (min.compareTo(min_value) > 0) { min = min_value.copy(); }
if (max.compareTo(max_value) < 0) { max = max_value.copy(); }
}

/**
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
*/
@Deprecated
public void initializeStats(Binary min_value, Binary max_value) {
min = min_value.copy();
max = max_value.copy();
Expand All @@ -98,14 +111,26 @@ public Binary genericGetMax() {
return max;
}

/**
* @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0
*/
@Deprecated
public Binary getMax() {
return max;
}

/**
* @deprecated use {@link #genericGetMin()}, will be removed in 2.0.0
*/
@Deprecated
public Binary getMin() {
return min;
}

/**
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
*/
@Deprecated
public void setMinMax(Binary min, Binary max) {
this.max = max;
this.min = min;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ public byte[] getMinBytes() {
return BytesUtils.booleanToBytes(min);
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || (2 < size);
}

@Override
public String toString() {
if (this.hasNonNullValue())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ public byte[] getMinBytes() {
return BytesUtils.longToBytes(Double.doubleToLongBits(min));
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || (16 < size);
}

@Override
public String toString() {
if(this.hasNonNullValue())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ public byte[] getMinBytes() {
return BytesUtils.intToBytes(Float.floatToIntBits(min));
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || (8 < size);
}

@Override
public String toString() {
if (this.hasNonNullValue())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ public byte[] getMinBytes() {
return BytesUtils.intToBytes(min);
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || (8 < size);
}

@Override
public String toString() {
if (this.hasNonNullValue())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ public byte[] getMinBytes() {
return BytesUtils.longToBytes(min);
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || (16 < size);
}

@Override
public String toString() {
if (this.hasNonNullValue())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,14 @@ public void mergeStatistics(Statistics stats) {
*/
abstract public byte[] getMinBytes();

/**
* Abstract method to return whether the min and max values fit in the given
* size.
* @param size a size in bytes
* @return true iff the min and max values are less than size bytes
*/
abstract public boolean isSmallerThan(long size);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

size() might be a more intuitive approach and would remove the burden of evaluating this check in the subclasses.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's been a while, but I think my logic was this:

  • getMinBytes and getMaxBytes allocate byte arrays and generally too much work for this check. We could cache the bytes, but that requires more work.
  • size isn't determined by Statistics because the metadata converter is responsible for actually converting stats into bytes. This only assumes that the entire min and max bytes will be written.

So we delegate to a stats object to avoid work, but can't delegate to stats because it doesn't do serialization. I could change it to getSizeHint, but I don't think that's much better and I'm inclined to leave it as is.


/**
* toString() to display min, max, num_nulls in a string
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ public class ParquetMetadataConverter {

public static final MetadataFilter NO_FILTER = new NoFilter();
public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter();
public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k

private static final Log LOG = Log.getLog(ParquetMetadataConverter.class);

Expand Down Expand Up @@ -284,7 +285,11 @@ dataPageType, getEncoding(encoding),
public static Statistics toParquetStatistics(
org.apache.parquet.column.statistics.Statistics statistics) {
Statistics stats = new Statistics();
if (!statistics.isEmpty()) {
// Don't write stats larger than the max size rather than truncating. The
// rationale is that some engines may use the minimum value in the page as
// the true minimum for aggregations and there is no way to mark that a
// value has been truncated and is a lower bound and not in the page.
if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to have null count without min/max. This is a nit, but engines could use null count even without min/max present.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just looked into this. There's nothing preventing us from doing this in the format, but the implementation assumes that there will be non-null min and max if numNulls != numRows. I think it makes sense to add this as a follow-up instead of making this patch larger.

stats.setNull_count(statistics.getNumNulls());
if (statistics.hasNonNullValue()) {
stats.setMax(statistics.getMaxBytes());
Expand All @@ -293,6 +298,7 @@ public static Statistics toParquetStatistics(
}
return stats;
}

/**
* @deprecated Replaced by {@link #fromParquetStatistics(
* String createdBy, Statistics statistics, PrimitiveTypeName type)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,21 @@
import java.util.TreeSet;

import com.google.common.collect.Sets;
import org.apache.parquet.Version;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.statistics.BinaryStatistics;
import org.apache.parquet.column.statistics.BooleanStatistics;
import org.apache.parquet.column.statistics.DoubleStatistics;
import org.apache.parquet.column.statistics.FloatStatistics;
import org.apache.parquet.column.statistics.IntStatistics;
import org.apache.parquet.column.statistics.LongStatistics;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.api.Binary;
import org.junit.Assert;
import org.junit.Test;

Expand Down Expand Up @@ -357,5 +366,153 @@ public void testEncodingsCache() {
assertEquals("java.util.Collections$UnmodifiableSet", res1.getClass().getName());
assertEquals("java.util.Collections$UnmodifiableSet", res2.getClass().getName());
assertEquals("java.util.Collections$UnmodifiableSet", res3.getClass().getName());
}
}

@Test
public void testBinaryStats() {
// make fake stats and verify the size check
BinaryStatistics stats = new BinaryStatistics();
stats.incrementNumNulls(3004);
byte[] min = new byte[904];
byte[] max = new byte[2388];
stats.updateStats(Binary.fromConstantByteArray(min));
stats.updateStats(Binary.fromConstantByteArray(max));
long totalLen = min.length + max.length;
Assert.assertFalse("Should not be smaller than min + max size",
stats.isSmallerThan(totalLen));
Assert.assertTrue("Should be smaller than min + max size + 1",
stats.isSmallerThan(totalLen + 1));

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertArrayEquals("Min should match", min, formatStats.getMin());
Assert.assertArrayEquals("Max should match", max, formatStats.getMax());
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());

// convert to empty stats because the values are too large
stats.setMinMaxFromBytes(max, max);

formatStats = ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertFalse("Min should not be set", formatStats.isSetMin());
Assert.assertFalse("Max should not be set", formatStats.isSetMax());
Assert.assertFalse("Num nulls should not be set",
formatStats.isSetNull_count());

Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatistics(
Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY);

Assert.assertTrue(roundTripStats.isEmpty());
}

@Test
public void testIntegerStats() {
// make fake stats and verify the size check
IntStatistics stats = new IntStatistics();
stats.incrementNumNulls(3004);
int min = Integer.MIN_VALUE;
int max = Integer.MAX_VALUE;
stats.updateStats(min);
stats.updateStats(max);

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertEquals("Min should match",
min, BytesUtils.bytesToInt(formatStats.getMin()));
Assert.assertEquals("Max should match",
max, BytesUtils.bytesToInt(formatStats.getMax()));
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());
}

@Test
public void testLongStats() {
// make fake stats and verify the size check
LongStatistics stats = new LongStatistics();
stats.incrementNumNulls(3004);
long min = Long.MIN_VALUE;
long max = Long.MAX_VALUE;
stats.updateStats(min);
stats.updateStats(max);

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertEquals("Min should match",
min, BytesUtils.bytesToLong(formatStats.getMin()));
Assert.assertEquals("Max should match",
max, BytesUtils.bytesToLong(formatStats.getMax()));
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());
}

@Test
public void testFloatStats() {
// make fake stats and verify the size check
FloatStatistics stats = new FloatStatistics();
stats.incrementNumNulls(3004);
float min = Float.MIN_VALUE;
float max = Float.MAX_VALUE;
stats.updateStats(min);
stats.updateStats(max);

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertEquals("Min should match",
min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())),
0.000001);
Assert.assertEquals("Max should match",
max, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMax())),
0.000001);
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());
}

@Test
public void testDoubleStats() {
// make fake stats and verify the size check
DoubleStatistics stats = new DoubleStatistics();
stats.incrementNumNulls(3004);
double min = Double.MIN_VALUE;
double max = Double.MAX_VALUE;
stats.updateStats(min);
stats.updateStats(max);

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertEquals("Min should match",
min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())),
0.000001);
Assert.assertEquals("Max should match",
max, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMax())),
0.000001);
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());
}

@Test
public void testBooleanStats() {
// make fake stats and verify the size check
BooleanStatistics stats = new BooleanStatistics();
stats.incrementNumNulls(3004);
boolean min = Boolean.FALSE;
boolean max = Boolean.TRUE;
stats.updateStats(min);
stats.updateStats(max);

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertEquals("Min should match",
min, BytesUtils.bytesToBool(formatStats.getMin()));
Assert.assertEquals("Max should match",
max, BytesUtils.bytesToBool(formatStats.getMax()));
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,15 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc
PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
Statistics stats = getStatisticsFromPageHeader(page);

if (stats.isEmpty()) {
// stats are empty if num nulls = 0 and there are no non-null values
// this happens if stats are not written (e.g., when stats are too big)
System.err.println(String.format(
"No stats written for page=%s col=%s",
page, Arrays.toString(desc.getPath())));
return;
}

long numNulls = 0;
ColumnReaderImpl column = new ColumnReaderImpl(desc, reader, converter, null);
for (int i = 0; i < reader.getTotalValueCount(); i += 1) {
Expand Down