Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion api/src/main/java/org/apache/iceberg/ContentFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ public interface ContentFile<F> {
*/
Map<Integer, Long> nullValueCounts();

/**
* Returns if collected, map from column ID to its NaN value count, null otherwise.
*/
Map<Integer, Long> nanValueCounts();

/**
* Returns if collected, map from column ID to value lower bounds, null otherwise.
*/
Expand Down Expand Up @@ -132,7 +137,8 @@ public interface ContentFile<F> {
* Copies this file without file stats. Manifest readers can reuse file instances; use
* this method to copy data without stats when collecting files.
*
* @return a copy of this data file, without lower bounds, upper bounds, value counts, or null value counts
* @return a copy of this data file, without lower bounds, upper bounds, value counts,
* null value counts, or nan value counts
*/
F copyWithoutStats();
}
6 changes: 5 additions & 1 deletion api/src/main/java/org/apache/iceberg/DataFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ public interface DataFile extends ContentFile<DataFile> {
IntegerType.get(), LongType.get()), "Map of column id to total count, including null and NaN");
Types.NestedField NULL_VALUE_COUNTS = optional(110, "null_value_counts", MapType.ofRequired(121, 122,
IntegerType.get(), LongType.get()), "Map of column id to null value count");
Types.NestedField NAN_VALUE_COUNTS = optional(137, "nan_value_counts", MapType.ofRequired(138, 139,
IntegerType.get(), LongType.get()), "Map of column id to number of NaN values in the column");
Types.NestedField LOWER_BOUNDS = optional(125, "lower_bounds", MapType.ofRequired(126, 127,
IntegerType.get(), BinaryType.get()), "Map of column id to lower bound");
Types.NestedField UPPER_BOUNDS = optional(128, "upper_bounds", MapType.ofRequired(129, 130,
Expand All @@ -59,10 +61,11 @@ public interface DataFile extends ContentFile<DataFile> {
"Splittable offsets");
Types.NestedField EQUALITY_IDS = optional(135, "equality_ids", ListType.ofRequired(136, IntegerType.get()),
"Equality comparison field IDs");

int PARTITION_ID = 102;
String PARTITION_NAME = "partition";
String PARTITION_DOC = "Partition data tuple, schema based on the partition spec";
// NEXT ID TO ASSIGN: 137
// NEXT ID TO ASSIGN: 140

static StructType getType(StructType partitionType) {
// IDs start at 100 to leave room for changes to ManifestEntry
Expand All @@ -76,6 +79,7 @@ static StructType getType(StructType partitionType) {
COLUMN_SIZES,
VALUE_COUNTS,
NULL_VALUE_COUNTS,
NAN_VALUE_COUNTS,
LOWER_BOUNDS,
UPPER_BOUNDS,
KEY_METADATA,
Expand Down
5 changes: 5 additions & 0 deletions api/src/test/java/org/apache/iceberg/TestHelpers.java
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,11 @@ public Map<Integer, Long> nullValueCounts() {
return nullValueCounts;
}

@Override
public Map<Integer, Long> nanValueCounts() {
return null; // will be updated in a separate pr soon
}

@Override
public Map<Integer, ByteBuffer> lowerBounds() {
return lowerBounds;
Expand Down
38 changes: 27 additions & 11 deletions core/src/main/java/org/apache/iceberg/BaseFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ public PartitionData copy() {
private Map<Integer, Long> columnSizes = null;
private Map<Integer, Long> valueCounts = null;
private Map<Integer, Long> nullValueCounts = null;
private Map<Integer, Long> nanValueCounts = null;
private Map<Integer, ByteBuffer> lowerBounds = null;
private Map<Integer, ByteBuffer> upperBounds = null;
private long[] splitOffsets = null;
Expand Down Expand Up @@ -117,7 +118,8 @@ public PartitionData copy() {

BaseFile(int specId, FileContent content, String filePath, FileFormat format,
PartitionData partition, long fileSizeInBytes, long recordCount,
Map<Integer, Long> columnSizes, Map<Integer, Long> valueCounts, Map<Integer, Long> nullValueCounts,
Map<Integer, Long> columnSizes, Map<Integer, Long> valueCounts,
Map<Integer, Long> nullValueCounts, Map<Integer, Long> nanValueCounts,
Map<Integer, ByteBuffer> lowerBounds, Map<Integer, ByteBuffer> upperBounds, List<Long> splitOffsets,
int[] equalityFieldIds, ByteBuffer keyMetadata) {
this.partitionSpecId = specId;
Expand All @@ -140,6 +142,7 @@ public PartitionData copy() {
this.columnSizes = columnSizes;
this.valueCounts = valueCounts;
this.nullValueCounts = nullValueCounts;
this.nanValueCounts = nanValueCounts;
this.lowerBounds = SerializableByteBufferMap.wrap(lowerBounds);
this.upperBounds = SerializableByteBufferMap.wrap(upperBounds);
this.splitOffsets = ArrayUtil.toLongArray(splitOffsets);
Expand Down Expand Up @@ -168,12 +171,14 @@ public PartitionData copy() {
this.columnSizes = copy(toCopy.columnSizes);
this.valueCounts = copy(toCopy.valueCounts);
this.nullValueCounts = copy(toCopy.nullValueCounts);
this.nanValueCounts = copy(toCopy.nanValueCounts);
this.lowerBounds = SerializableByteBufferMap.wrap(copy(toCopy.lowerBounds));
this.upperBounds = SerializableByteBufferMap.wrap(copy(toCopy.upperBounds));
} else {
this.columnSizes = null;
this.valueCounts = null;
this.nullValueCounts = null;
this.nanValueCounts = null;
this.lowerBounds = null;
this.upperBounds = null;
}
Expand Down Expand Up @@ -247,21 +252,24 @@ public void put(int i, Object value) {
this.nullValueCounts = (Map<Integer, Long>) value;
return;
case 9:
this.lowerBounds = SerializableByteBufferMap.wrap((Map<Integer, ByteBuffer>) value);
this.nanValueCounts = (Map<Integer, Long>) value;
return;
case 10:
this.upperBounds = SerializableByteBufferMap.wrap((Map<Integer, ByteBuffer>) value);
this.lowerBounds = SerializableByteBufferMap.wrap((Map<Integer, ByteBuffer>) value);
return;
case 11:
this.keyMetadata = ByteBuffers.toByteArray((ByteBuffer) value);
this.upperBounds = SerializableByteBufferMap.wrap((Map<Integer, ByteBuffer>) value);
return;
case 12:
this.splitOffsets = ArrayUtil.toLongArray((List<Long>) value);
this.keyMetadata = ByteBuffers.toByteArray((ByteBuffer) value);
return;
case 13:
this.equalityIds = ArrayUtil.toIntArray((List<Integer>) value);
this.splitOffsets = ArrayUtil.toLongArray((List<Long>) value);
return;
case 14:
this.equalityIds = ArrayUtil.toIntArray((List<Integer>) value);
return;
case 15:
this.fileOrdinal = (long) value;
return;
default:
Expand Down Expand Up @@ -301,16 +309,18 @@ public Object get(int i) {
case 8:
return nullValueCounts;
case 9:
return lowerBounds;
return nanValueCounts;
case 10:
return upperBounds;
return lowerBounds;
case 11:
return keyMetadata();
return upperBounds;
case 12:
return splitOffsets();
return keyMetadata();
case 13:
return equalityFieldIds();
return splitOffsets();
case 14:
return equalityFieldIds();
case 15:
return pos;
default:
throw new UnsupportedOperationException("Unknown field ordinal: " + pos);
Expand Down Expand Up @@ -377,6 +387,11 @@ public Map<Integer, Long> nullValueCounts() {
return nullValueCounts;
}

@Override
public Map<Integer, Long> nanValueCounts() {
return nanValueCounts;
}

@Override
public Map<Integer, ByteBuffer> lowerBounds() {
return lowerBounds;
Expand Down Expand Up @@ -423,6 +438,7 @@ public String toString() {
.add("column_sizes", columnSizes)
.add("value_counts", valueCounts)
.add("null_value_counts", nullValueCounts)
.add("nan_value_counts", nanValueCounts)
.add("lower_bounds", lowerBounds)
.add("upper_bounds", upperBounds)
.add("key_metadata", keyMetadata == null ? "null" : "(redacted)")
Expand Down
6 changes: 5 additions & 1 deletion core/src/main/java/org/apache/iceberg/DataFiles.java
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ public static class Builder {
private Map<Integer, Long> columnSizes = null;
private Map<Integer, Long> valueCounts = null;
private Map<Integer, Long> nullValueCounts = null;
private Map<Integer, Long> nanValueCounts = null;
private Map<Integer, ByteBuffer> lowerBounds = null;
private Map<Integer, ByteBuffer> upperBounds = null;
private ByteBuffer keyMetadata = null;
Expand All @@ -149,6 +150,7 @@ public void clear() {
this.columnSizes = null;
this.valueCounts = null;
this.nullValueCounts = null;
this.nanValueCounts = null;
this.lowerBounds = null;
this.upperBounds = null;
this.splitOffsets = null;
Expand All @@ -166,6 +168,7 @@ public Builder copy(DataFile toCopy) {
this.columnSizes = toCopy.columnSizes();
this.valueCounts = toCopy.valueCounts();
this.nullValueCounts = toCopy.nullValueCounts();
this.nanValueCounts = toCopy.nanValueCounts();
this.lowerBounds = toCopy.lowerBounds();
this.upperBounds = toCopy.upperBounds();
this.keyMetadata = toCopy.keyMetadata() == null ? null
Expand Down Expand Up @@ -241,6 +244,7 @@ public Builder withMetrics(Metrics metrics) {
this.columnSizes = metrics.columnSizes();
this.valueCounts = metrics.valueCounts();
this.nullValueCounts = metrics.nullValueCounts();
this.nanValueCounts = metrics.nanValueCounts();
this.lowerBounds = metrics.lowerBounds();
this.upperBounds = metrics.upperBounds();
return this;
Expand Down Expand Up @@ -276,7 +280,7 @@ public DataFile build() {
return new GenericDataFile(
specId, filePath, format, isPartitioned ? partitionData.copy() : null,
fileSizeInBytes, new Metrics(
recordCount, columnSizes, valueCounts, nullValueCounts, lowerBounds, upperBounds),
recordCount, columnSizes, valueCounts, nullValueCounts, nanValueCounts, lowerBounds, upperBounds),
keyMetadata, splitOffsets);
}
}
Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/org/apache/iceberg/DataTableScan.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ public class DataTableScan extends BaseTableScan {
);
static final ImmutableList<String> SCAN_WITH_STATS_COLUMNS = ImmutableList.<String>builder()
.addAll(SCAN_COLUMNS)
.add("value_counts", "null_value_counts", "lower_bounds", "upper_bounds", "column_sizes")
.add("value_counts", "null_value_counts", "nan_value_counts", "lower_bounds", "upper_bounds", "column_sizes")
.build();
static final boolean PLAN_SCANS_WITH_WORKER_POOL =
SystemProperties.getBoolean(SystemProperties.SCAN_THREAD_POOL_ENABLED, true);
Expand Down
6 changes: 5 additions & 1 deletion core/src/main/java/org/apache/iceberg/FileMetadata.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ public static class Builder {
private Map<Integer, Long> columnSizes = null;
private Map<Integer, Long> valueCounts = null;
private Map<Integer, Long> nullValueCounts = null;
private Map<Integer, Long> nanValueCounts = null;
private Map<Integer, ByteBuffer> lowerBounds = null;
private Map<Integer, ByteBuffer> upperBounds = null;
private ByteBuffer keyMetadata = null;
Expand All @@ -76,6 +77,7 @@ public void clear() {
this.columnSizes = null;
this.valueCounts = null;
this.nullValueCounts = null;
this.nanValueCounts = null;
this.lowerBounds = null;
this.upperBounds = null;
}
Expand All @@ -93,6 +95,7 @@ public Builder copy(DeleteFile toCopy) {
this.columnSizes = toCopy.columnSizes();
this.valueCounts = toCopy.valueCounts();
this.nullValueCounts = toCopy.nullValueCounts();
this.nanValueCounts = toCopy.nanValueCounts();
this.lowerBounds = toCopy.lowerBounds();
this.upperBounds = toCopy.upperBounds();
this.keyMetadata = toCopy.keyMetadata() == null ? null
Expand Down Expand Up @@ -179,6 +182,7 @@ public Builder withMetrics(Metrics metrics) {
this.columnSizes = metrics.columnSizes();
this.valueCounts = metrics.valueCounts();
this.nullValueCounts = metrics.nullValueCounts();
this.nanValueCounts = metrics.nanValueCounts();
this.lowerBounds = metrics.lowerBounds();
this.upperBounds = metrics.upperBounds();
return this;
Expand Down Expand Up @@ -206,7 +210,7 @@ public DeleteFile build() {
return new GenericDeleteFile(
specId, content, filePath, format, isPartitioned ? DataFiles.copy(spec, partitionData) : null,
fileSizeInBytes, new Metrics(
recordCount, columnSizes, valueCounts, nullValueCounts, lowerBounds, upperBounds),
recordCount, columnSizes, valueCounts, nullValueCounts, nanValueCounts, lowerBounds, upperBounds),
equalityFieldIds, keyMetadata);
}
}
Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/org/apache/iceberg/GenericDataFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class GenericDataFile extends BaseFile<DataFile> implements DataFile {
long fileSizeInBytes, Metrics metrics,
ByteBuffer keyMetadata, List<Long> splitOffsets) {
super(specId, FileContent.DATA, filePath, format, partition, fileSizeInBytes, metrics.recordCount(),
metrics.columnSizes(), metrics.valueCounts(), metrics.nullValueCounts(),
metrics.columnSizes(), metrics.valueCounts(), metrics.nullValueCounts(), metrics.nanValueCounts(),
metrics.lowerBounds(), metrics.upperBounds(), splitOffsets, null, keyMetadata);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class GenericDeleteFile extends BaseFile<DeleteFile> implements DeleteFile {
GenericDeleteFile(int specId, FileContent content, String filePath, FileFormat format, PartitionData partition,
long fileSizeInBytes, Metrics metrics, int[] equalityFieldIds, ByteBuffer keyMetadata) {
super(specId, content, filePath, format, partition, fileSizeInBytes, metrics.recordCount(),
metrics.columnSizes(), metrics.valueCounts(), metrics.nullValueCounts(),
metrics.columnSizes(), metrics.valueCounts(), metrics.nullValueCounts(), metrics.nanValueCounts(),
metrics.lowerBounds(), metrics.upperBounds(), null, equalityFieldIds, keyMetadata);
}

Expand Down
2 changes: 1 addition & 1 deletion core/src/main/java/org/apache/iceberg/ManifestReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public class ManifestReader<F extends ContentFile<F>>
extends CloseableGroup implements CloseableIterable<F> {
static final ImmutableList<String> ALL_COLUMNS = ImmutableList.of("*");
static final Set<String> STATS_COLUMNS = Sets.newHashSet(
"value_counts", "null_value_counts", "lower_bounds", "upper_bounds");
"value_counts", "null_value_counts", "nan_value_counts", "lower_bounds", "upper_bounds");

protected enum FileType {
DATA_FILES(GenericDataFile.class.getName()),
Expand Down
12 changes: 7 additions & 5 deletions core/src/main/java/org/apache/iceberg/MetricsModes.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

/**
* This class defines different metrics modes, which allow users to control the collection of
* value_counts, null_value_counts, lower_bounds, upper_bounds for different columns in metadata.
* value_counts, null_value_counts, nan_value_counts, lower_bounds, upper_bounds for different columns in metadata.
*/
public class MetricsModes {

Expand Down Expand Up @@ -60,7 +60,7 @@ public interface MetricsMode extends Serializable {
}

/**
* Under this mode, value_counts, null_value_counts, lower_bounds, upper_bounds are not persisted.
* Under this mode, value_counts, null_value_counts, nan_value_counts, lower_bounds, upper_bounds are not persisted.
*/
public static class None extends ProxySerializableMetricsMode {
private static final None INSTANCE = new None();
Expand All @@ -76,7 +76,7 @@ public String toString() {
}

/**
* Under this mode, only value_counts, null_value_counts are persisted.
* Under this mode, only value_counts, null_value_counts, nan_value_counts are persisted.
*/
public static class Counts extends ProxySerializableMetricsMode {
private static final Counts INSTANCE = new Counts();
Expand All @@ -92,7 +92,8 @@ public String toString() {
}

/**
* Under this mode, value_counts, null_value_counts and truncated lower_bounds, upper_bounds are persisted.
* Under this mode, value_counts, null_value_counts, nan_value_counts
* and truncated lower_bounds, upper_bounds are persisted.
*/
public static class Truncate extends ProxySerializableMetricsMode {
private final int length;
Expand Down Expand Up @@ -133,7 +134,8 @@ public int hashCode() {
}

/**
* Under this mode, value_counts, null_value_counts and full lower_bounds, upper_bounds are persisted.
* Under this mode, value_counts, null_value_counts, nan_value_counts
* and full lower_bounds, upper_bounds are persisted.
*/
public static class Full extends ProxySerializableMetricsMode {
private static final Full INSTANCE = new Full();
Expand Down
14 changes: 11 additions & 3 deletions core/src/main/java/org/apache/iceberg/V1Metadata.java
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ static Types.StructType dataFileSchema(Types.StructType partitionType) {
DataFile.COLUMN_SIZES,
DataFile.VALUE_COUNTS,
DataFile.NULL_VALUE_COUNTS,
DataFile.NAN_VALUE_COUNTS,
DataFile.LOWER_BOUNDS,
DataFile.UPPER_BOUNDS,
DataFile.KEY_METADATA,
Expand Down Expand Up @@ -343,12 +344,14 @@ public Object get(int pos) {
case 8:
return wrapped.nullValueCounts();
case 9:
return wrapped.lowerBounds();
return wrapped.nanValueCounts();
case 10:
return wrapped.upperBounds();
return wrapped.lowerBounds();
case 11:
return wrapped.keyMetadata();
return wrapped.upperBounds();
case 12:
return wrapped.keyMetadata();
case 13:
return wrapped.splitOffsets();
}
throw new IllegalArgumentException("Unknown field ordinal: " + pos);
Expand Down Expand Up @@ -419,6 +422,11 @@ public Map<Integer, Long> nullValueCounts() {
return wrapped.nullValueCounts();
}

@Override
public Map<Integer, Long> nanValueCounts() {
return wrapped.nanValueCounts();
}

@Override
public Map<Integer, ByteBuffer> lowerBounds() {
return wrapped.lowerBounds();
Expand Down
Loading