Skip to content

Commit c264b50

Browse files
author
Gabor Szadovszky
committed
PARQUET-1217: fix handling of unset nullCount
1 parent 2ec2fb1 commit c264b50

File tree

10 files changed

+163
-48
lines changed

10 files changed

+163
-48
lines changed

parquet-cli/src/main/java/org/apache/parquet/cli/commands/ParquetMetadataCommand.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,12 +168,12 @@ private void printColumnChunk(Logger console, int width, ColumnChunkMetaData col
168168
if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
169169
console.info(String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
170170
name, type.getTypeLength(), shortCodec(codec), encodingSummary, count,
171-
humanReadable(perValue), stats == null ? "" : String.valueOf(stats.getNumNulls()),
171+
humanReadable(perValue), stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
172172
minMaxAsString(stats, type.getOriginalType())));
173173
} else {
174174
console.info(String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s",
175175
name, typeName, shortCodec(codec), encodingSummary, count, humanReadable(perValue),
176-
stats == null ? "" : String.valueOf(stats.getNumNulls()),
176+
stats == null || !stats.isNumNullsSet() ? "" : String.valueOf(stats.getNumNulls()),
177177
minMaxAsString(stats, type.getOriginalType())));
178178
}
179179
}

parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowPagesCommand.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ public String visit(DataPageV1 page) {
191191
String enc = encodingAsString(page.getValueEncoding(), false);
192192
long totalSize = page.getCompressedSize();
193193
int count = page.getValueCount();
194-
long numNulls = page.getStatistics().getNumNulls();
194+
String numNulls = page.getStatistics().isNumNullsSet() ? Long.toString(page.getStatistics().getNumNulls()) : "";
195195
float perValue = ((float) totalSize) / count;
196196
String minMax = minMaxAsString(page.getStatistics(), type.getOriginalType());
197197
return String.format("%3d-%-3d %-5s %s %-2s %-7d %-10s %-10s %-8s %-7s %s",

parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java

Lines changed: 72 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,44 @@
3535
*/
3636
public abstract class Statistics<T extends Comparable<T>> {
3737

38+
/**
39+
* Builder class to build Statistics objects. Used to read the statistics from the Parquet file.
40+
*/
41+
public static class StatisticsBuilder {
42+
private final PrimitiveType type;
43+
private byte[] min;
44+
private byte[] max;
45+
private long numNulls = -1;
46+
47+
private StatisticsBuilder(PrimitiveType type) {
48+
this.type = type;
49+
}
50+
51+
public StatisticsBuilder withMin(byte[] min) {
52+
this.min = min;
53+
return this;
54+
}
55+
56+
public StatisticsBuilder withMax(byte[] max) {
57+
this.max = max;
58+
return this;
59+
}
60+
61+
public StatisticsBuilder withNumNulls(long numNulls) {
62+
this.numNulls = numNulls;
63+
return this;
64+
}
65+
66+
public Statistics<?> build() {
67+
Statistics<?> stats = createStats(type);
68+
if (min != null && max != null) {
69+
stats.setMinMaxFromBytes(min, max);
70+
}
71+
stats.num_nulls = this.numNulls;
72+
return stats;
73+
}
74+
}
75+
3876
private final PrimitiveType type;
3977
private final PrimitiveComparator<T> comparator;
4078
private boolean hasNonNullValue;
@@ -109,6 +147,17 @@ public static Statistics<?> createStats(Type type) {
109147
}
110148
}
111149

150+
/**
151+
* Returns a builder to create new statistics object. Used to read the statistics form the parquet file.
152+
*
153+
* @param type
154+
* type of the column
155+
* @return builder to create new statistics object
156+
*/
157+
public static StatisticsBuilder getBuilder(PrimitiveType type) {
158+
return new StatisticsBuilder(type);
159+
}
160+
112161
/**
113162
* updates statistics min and max using the passed value
114163
* @param value value to use to update min and max
@@ -217,7 +266,9 @@ public void mergeStatistics(Statistics stats) {
217266
* Abstract method to set min and max values from byte arrays.
218267
* @param minBytes byte array to set the min value to
219268
* @param maxBytes byte array to set the max value to
269+
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
220270
*/
271+
@Deprecated
221272
abstract public void setMinMaxFromBytes(byte[] minBytes, byte[] maxBytes);
222273

223274
/**
@@ -310,9 +361,13 @@ public String maxAsString() {
310361

311362
@Override
312363
public String toString() {
313-
if (this.hasNonNullValue())
314-
return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls());
315-
else if (!this.isEmpty())
364+
if (this.hasNonNullValue()) {
365+
if (isNumNullsSet()) {
366+
return String.format("min: %s, max: %s, num_nulls: %d", minAsString(), maxAsString(), this.getNumNulls());
367+
} else {
368+
return String.format("min: %s, max: %s, num_nulls not defined", minAsString(), maxAsString());
369+
}
370+
} else if (!this.isEmpty())
316371
return String.format("num_nulls: %d, min/max not defined", this.getNumNulls());
317372
else
318373
return "no stats for this column";
@@ -335,16 +390,20 @@ public void incrementNumNulls(long increment) {
335390

336391
/**
337392
* Returns the null count
338-
* @return null count
393+
* @return null count or {@code -1} if the null count is not set
339394
*/
340395
public long getNumNulls() {
341396
return num_nulls;
342397
}
343398

344399
/**
345400
* Sets the number of nulls to the parameter value
346-
* @param nulls null count to set the count to
401+
*
402+
* @param nulls
403+
* null count to set the count to
404+
* @deprecated will be removed in 2.0.0. Use {@link #getBuilder(PrimitiveType)} instead.
347405
*/
406+
@Deprecated
348407
public void setNumNulls(long nulls) {
349408
num_nulls = nulls;
350409
}
@@ -355,7 +414,7 @@ public void setNumNulls(long nulls) {
355414
* @return true if object is empty, false otherwise
356415
*/
357416
public boolean isEmpty() {
358-
return !hasNonNullValue && num_nulls == 0;
417+
return !hasNonNullValue && !isNumNullsSet();
359418
}
360419

361420
/**
@@ -365,6 +424,13 @@ public boolean hasNonNullValue() {
365424
return hasNonNullValue;
366425
}
367426

427+
/**
428+
* @return whether numNulls is set and can be used
429+
*/
430+
public boolean isNumNullsSet() {
431+
return num_nulls >= 0;
432+
}
433+
368434
/**
369435
* Sets the page/column as having a valid non-null value
370436
* kind of misnomer here

parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ public class TestStatistics {
4242
@Test
4343
public void testNumNulls() {
4444
IntStatistics stats = new IntStatistics();
45+
assertTrue(stats.isNumNullsSet());
4546
assertEquals(stats.getNumNulls(), 0);
4647

4748
stats.incrementNumNulls();

parquet-hadoop/src/main/java/org/apache/parquet/filter2/statisticslevel/StatisticsFilter.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ public <T extends Comparable<T>> Boolean visit(Eq<T> eq) {
121121
}
122122

123123
if (value == null) {
124+
// We don't know anything about the nulls in this chunk
125+
if (!stats.isNumNullsSet()) {
126+
return BLOCK_MIGHT_MATCH;
127+
}
124128
// we are looking for records where v eq(null)
125129
// so drop if there are no nulls in this chunk
126130
return !hasNulls(meta);
@@ -170,7 +174,7 @@ public <T extends Comparable<T>> Boolean visit(NotEq<T> notEq) {
170174
return isAllNulls(meta);
171175
}
172176

173-
if (hasNulls(meta)) {
177+
if (stats.isNumNullsSet() && hasNulls(meta)) {
174178
// we are looking for records where v notEq(someNonNull)
175179
// but this chunk contains nulls, we cannot drop it
176180
return BLOCK_MIGHT_MATCH;

parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
6565
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
6666
import org.apache.parquet.column.EncodingStats;
67+
import org.apache.parquet.column.statistics.Statistics.StatisticsBuilder;
6768
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
6869
import org.apache.parquet.io.ParquetDecodingException;
6970
import org.apache.parquet.schema.ColumnOrder.ColumnOrderName;
@@ -401,17 +402,20 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
401402
static org.apache.parquet.column.statistics.Statistics fromParquetStatisticsInternal
402403
(String createdBy, Statistics formatStats, PrimitiveType type, SortOrder typeSortOrder) {
403404
// create stats object based on the column type
404-
org.apache.parquet.column.statistics.Statistics stats = org.apache.parquet.column.statistics.Statistics.createStats(type);
405+
StatisticsBuilder builder = org.apache.parquet.column.statistics.Statistics.getBuilder(type);
405406

406407
if (formatStats != null) {
407408
// Use the new V2 min-max statistics over the former one if it is filled
408409
if (formatStats.isSetMin_value() && formatStats.isSetMax_value()) {
409410
byte[] min = formatStats.min_value.array();
410411
byte[] max = formatStats.max_value.array();
411412
if (isMinMaxStatsSupported(type) || Arrays.equals(min, max)) {
412-
stats.setMinMaxFromBytes(min, max);
413+
builder.withMin(min);
414+
builder.withMax(max);
415+
}
416+
if (formatStats.isSetNull_count()) {
417+
builder.withNumNulls(formatStats.null_count);
413418
}
414-
stats.setNumNulls(formatStats.null_count);
415419
} else {
416420
boolean isSet = formatStats.isSetMax() && formatStats.isSetMin();
417421
boolean maxEqualsMin = isSet ? Arrays.equals(formatStats.getMin(), formatStats.getMax()) : false;
@@ -424,13 +428,16 @@ public static org.apache.parquet.column.statistics.Statistics fromParquetStatist
424428
if (!CorruptStatistics.shouldIgnoreStatistics(createdBy, type.getPrimitiveTypeName()) &&
425429
(sortOrdersMatch || maxEqualsMin)) {
426430
if (isSet) {
427-
stats.setMinMaxFromBytes(formatStats.min.array(), formatStats.max.array());
431+
builder.withMin(formatStats.min.array());
432+
builder.withMax(formatStats.max.array());
433+
}
434+
if (formatStats.isSetNull_count()) {
435+
builder.withNumNulls(formatStats.null_count);
428436
}
429-
stats.setNumNulls(formatStats.null_count);
430437
}
431438
}
432439
}
433-
return stats;
440+
return builder.build();
434441
}
435442

436443
public org.apache.parquet.column.statistics.Statistics fromParquetStatistics(

parquet-hadoop/src/test/java/org/apache/parquet/filter2/statisticslevel/TestStatisticsFilter.java

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
3939
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
4040
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
41+
import org.apache.parquet.schema.Types;
4142

4243
import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
4344
import static org.apache.parquet.io.api.Binary.fromString;
@@ -61,7 +62,8 @@
6162

6263
public class TestStatisticsFilter {
6364

64-
private static ColumnChunkMetaData getIntColumnMeta(IntStatistics stats, long valueCount) {
65+
private static ColumnChunkMetaData getIntColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats,
66+
long valueCount) {
6567
return ColumnChunkMetaData.get(ColumnPath.get("int", "column"),
6668
PrimitiveTypeName.INT32,
6769
CompressionCodecName.GZIP,
@@ -70,7 +72,8 @@ private static ColumnChunkMetaData getIntColumnMeta(IntStatistics stats, long va
7072
0L, 0L, valueCount, 0L, 0L);
7173
}
7274

73-
private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, long valueCount) {
75+
private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats,
76+
long valueCount) {
7477
return ColumnChunkMetaData.get(ColumnPath.get("double", "column"),
7578
PrimitiveTypeName.DOUBLE,
7679
CompressionCodecName.GZIP,
@@ -86,18 +89,17 @@ private static ColumnChunkMetaData getDoubleColumnMeta(DoubleStatistics stats, l
8689

8790
private static final IntStatistics intStats = new IntStatistics();
8891
private static final IntStatistics nullIntStats = new IntStatistics();
89-
private static final IntStatistics missingMinMaxIntStats = new IntStatistics();
92+
private static final org.apache.parquet.column.statistics.Statistics<?> missingMinMaxIntStats = org.apache.parquet.column.statistics.Statistics
93+
.getBuilder(Types.required(PrimitiveTypeName.INT32).named("test_int32")).build();
9094
private static final DoubleStatistics doubleStats = new DoubleStatistics();
91-
private static final DoubleStatistics missingMinMaxDoubleStats = new DoubleStatistics();
95+
private static final org.apache.parquet.column.statistics.Statistics<?> missingMinMaxDoubleStats = org.apache.parquet.column.statistics.Statistics
96+
.getBuilder(Types.required(PrimitiveTypeName.DOUBLE).named("test_double")).withNumNulls(100).build();
9297

9398
static {
9499
intStats.setMinMax(10, 100);
95100
doubleStats.setMinMax(10, 100);
96101

97-
nullIntStats.setMinMax(0, 0);
98102
nullIntStats.setNumNulls(177);
99-
100-
missingMinMaxDoubleStats.setNumNulls(100);
101103
}
102104

103105
private static final List<ColumnChunkMetaData> columnMetas = Arrays.asList(

parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,7 @@ private void testUseStatsWithSignedSortOrder(StatsHelper helper) {
658658
binaryType);
659659

660660
Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty());
661+
Assert.assertTrue(convertedStats.isNumNullsSet());
661662
Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls());
662663
if (helper == StatsHelper.V1) {
663664
assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue());
@@ -669,6 +670,38 @@ private void testUseStatsWithSignedSortOrder(StatsHelper helper) {
669670
}
670671
}
671672

673+
@Test
674+
public void testMissingValuesFromStats() {
675+
ParquetMetadataConverter converter = new ParquetMetadataConverter();
676+
PrimitiveType type = Types.required(PrimitiveTypeName.INT32).named("test_int32");
677+
678+
org.apache.parquet.format.Statistics formatStats = new org.apache.parquet.format.Statistics();
679+
Statistics<?> stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type);
680+
assertFalse(stats.isNumNullsSet());
681+
assertFalse(stats.hasNonNullValue());
682+
assertTrue(stats.isEmpty());
683+
assertEquals(-1, stats.getNumNulls());
684+
685+
formatStats.clear();
686+
formatStats.setMin(BytesUtils.intToBytes(-100));
687+
formatStats.setMax(BytesUtils.intToBytes(100));
688+
stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type);
689+
assertFalse(stats.isNumNullsSet());
690+
assertTrue(stats.hasNonNullValue());
691+
assertFalse(stats.isEmpty());
692+
assertEquals(-1, stats.getNumNulls());
693+
assertEquals(-100, stats.genericGetMin());
694+
assertEquals(100, stats.genericGetMax());
695+
696+
formatStats.clear();
697+
formatStats.setNull_count(2000);
698+
stats = converter.fromParquetStatistics(Version.FULL_VERSION, formatStats, type);
699+
assertTrue(stats.isNumNullsSet());
700+
assertFalse(stats.hasNonNullValue());
701+
assertFalse(stats.isEmpty());
702+
assertEquals(2000, stats.getNumNulls());
703+
}
704+
672705
@Test
673706
public void testSkippedV2Stats() {
674707
testSkippedV2Stats(

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
6161
import org.apache.parquet.schema.MessageType;
6262
import org.apache.parquet.schema.MessageTypeParser;
63+
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
6364
import org.apache.parquet.schema.Types;
6465
import org.apache.parquet.bytes.HeapByteBufferAllocator;
6566

@@ -92,7 +93,8 @@ public void test() throws Exception {
9293
int v = 3;
9394
BytesInput definitionLevels = BytesInput.fromInt(d);
9495
BytesInput repetitionLevels = BytesInput.fromInt(r);
95-
Statistics<?> statistics = new BinaryStatistics();
96+
Statistics<?> statistics = Statistics.getBuilder(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
97+
.build();
9698
BytesInput data = BytesInput.fromInt(v);
9799
int rowCount = 5;
98100
int nullCount = 1;

0 commit comments

Comments
 (0)