From 207fc763ca80b65b3bf880bbdb8d408ab09e7799 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Tue, 14 Feb 2023 19:43:59 +0800 Subject: [PATCH 01/21] Parquet: Implement column index filter. --- .../apache/iceberg/parquet/IndexIterator.java | 75 ++ .../parquet/ParquetColumnIndexFilter.java | 629 +++++++++++++++ .../parquet/TestColumnIndexFilter.java | 738 ++++++++++++++++++ 3 files changed, 1442 insertions(+) create mode 100644 parquet/src/main/java/org/apache/iceberg/parquet/IndexIterator.java create mode 100644 parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java create mode 100644 parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/IndexIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/IndexIterator.java new file mode 100644 index 000000000000..a9fad04ace62 --- /dev/null +++ b/parquet/src/main/java/org/apache/iceberg/parquet/IndexIterator.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
/**
 * A {@link PrimitiveIterator.OfInt} over the page indexes of a column chunk.
 *
 * <p>Pages are visited in ascending order; a predicate decides which page indexes are produced,
 * and a translator maps each produced index to the value returned to the caller.
 */
class IndexIterator implements PrimitiveIterator.OfInt {
  // Sentinel meaning the iterator is exhausted.
  private static final int NO_MORE_PAGES = -1;

  private final int limit;
  private final IntPredicate predicate;
  private final IntUnaryOperator translate;
  private int current;

  private IndexIterator(int startIndex, int endIndex, IntPredicate filter, IntUnaryOperator translator) {
    this.limit = endIndex;
    this.predicate = filter;
    this.translate = translator;
    this.current = nextPageIndex(startIndex);
  }

  /** Returns an iterator over every page index in {@code [0, pageCount)}. */
  static PrimitiveIterator.OfInt all(int pageCount) {
    return new IndexIterator(0, pageCount, i -> true, i -> i);
  }

  /** Returns an iterator over the page indexes in {@code [0, pageCount)} accepted by {@code filter}. */
  static PrimitiveIterator.OfInt filter(int pageCount, IntPredicate filter) {
    return new IndexIterator(0, pageCount, filter, i -> i);
  }

  // Finds the first accepted page index at or after startIndex, or NO_MORE_PAGES if none remain.
  private int nextPageIndex(int startIndex) {
    int candidate = startIndex;
    while (candidate < limit) {
      if (predicate.test(candidate)) {
        return candidate;
      }
      candidate += 1;
    }

    return NO_MORE_PAGES;
  }

  @Override
  public boolean hasNext() {
    return current != NO_MORE_PAGES;
  }

  @Override
  public int nextInt() {
    if (current == NO_MORE_PAGES) {
      throw new NoSuchElementException();
    }

    int page = current;
    current = nextPageIndex(page + 1);
    return translate.applyAsInt(page);
  }
}
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.parquet; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.PrimitiveIterator; +import java.util.Set; +import java.util.function.Function; +import java.util.function.IntPredicate; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Binder; +import org.apache.iceberg.expressions.BoundReference; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.ExpressionVisitors; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.Literal; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.BinaryUtil; +import org.apache.iceberg.util.Pair; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; +import org.apache.parquet.schema.GroupType; +import 
org.apache.parquet.schema.MessageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class ParquetColumnIndexFilter { + + private static final Logger LOG = LoggerFactory.getLogger(ParquetColumnIndexFilter.class); + + private final Schema schema; + private final Expression expr; + + public ParquetColumnIndexFilter(Schema schema, Expression unbound, boolean caseSensitive) { + this.schema = schema; + this.expr = Binder.bind(schema.asStruct(), Expressions.rewriteNot(unbound), caseSensitive); + } + + /** + * Calculates the row ranges containing the indexes of the rows might match the expression. + * @param typeWithIds schema for the Parquet file with Iceberg type IDs + * @param columnIndexStore the store for providing column/offset indexes + * @param rowCount the total number of rows in the row-group + * @return the ranges of the possible matching row indexes; the returned ranges will contain all the rows + * if any of the required offset index is missing + */ + public RowRanges calculateRowRanges(MessageType typeWithIds, ColumnIndexStore columnIndexStore, long rowCount) { + try { + return new ColumnIndexEvalVisitor(typeWithIds, columnIndexStore, rowCount).eval(); + } catch (ColumnIndexStore.MissingOffsetIndexException e) { + LOG.info("Cannot get required offset index; Unable to filter on this row group", e); + return RowRanges.createSingle(rowCount); + } + } + + private static final boolean ROWS_MIGHT_MATCH = true; + private static final boolean ROWS_CANNOT_MATCH = false; + private static final RowRanges NO_ROWS = RowRanges.EMPTY; + + private class ColumnIndexEvalVisitor extends ExpressionVisitors.BoundExpressionVisitor { + + private final Map idToColumn = Maps.newHashMap(); + private final Map idToColumnIndex = Maps.newHashMap(); + private final Map idToOffsetIndex = Maps.newHashMap(); + private final Map> conversions = Maps.newHashMap(); + + private final RowRanges allRows; + private final ColumnIndexStore columnIndexStore; + private final long 
rowCount; + + private ColumnIndexEvalVisitor(MessageType typeWithIds, ColumnIndexStore columnIndexStore, long rowCount) { + this.allRows = RowRanges.createSingle(rowCount); + this.columnIndexStore = columnIndexStore; + this.rowCount = rowCount; + idByColumnPath(typeWithIds.asGroupType(), null, idToColumn); + } + + private RowRanges eval() { + return ExpressionVisitors.visit(expr, this); + } + + private void idByColumnPath(GroupType type, String parent, Map idToColumnPath) { + String prefix = parent == null ? "" : parent + "."; + for (org.apache.parquet.schema.Type field : type.getFields()) { + if (field.isPrimitive()) { + idToColumnPath.put(field.getId().intValue(), ColumnPath.fromDotString(prefix + field.getName())); + } else { + idByColumnPath(field.asGroupType(), prefix, idToColumnPath); + } + } + } + + @Override + public RowRanges alwaysTrue() { + return allRows; + } + + @Override + public RowRanges alwaysFalse() { + return NO_ROWS; + } + + @Override + public RowRanges not(RowRanges result) { + // The resulting row ranges for column index filter calculations is overestimated, + // so evaluation of NOT expressions is not supported + throw new UnsupportedOperationException("Cannot support evaluating NOT"); + } + + @Override + public RowRanges and(RowRanges left, RowRanges right) { + return RowRanges.intersection(left, right); + } + + @Override + public RowRanges or(RowRanges left, RowRanges right) { + return RowRanges.union(left, right); + } + + @Override + public RowRanges isNull(BoundReference ref) { + int id = ref.fieldId(); + + Function func = columnIndex -> { + if (columnIndex.hasNullCounts()) { + return IndexIterator.filter(columnIndex.pageCount(), columnIndex::containsNull); + } else { + // Searching for nulls so if we don't have null related statistics we have to return all pages + return IndexIterator.all(columnIndex.pageCount()); + } + }; + + return applyPredicate(id, func, ROWS_MIGHT_MATCH); + } + + @Override + public RowRanges notNull(BoundReference 
ref) { + int id = ref.fieldId(); + + // When filtering nested types notNull() is implicit filter passed even though complex + // filters aren't pushed down in Parquet. Leave all nested column type filters to be + // evaluated post scan. + if (schema.findType(id) instanceof Type.NestedType) { + return allRows; + } + + Function func = + columnIndex -> IndexIterator.filter(columnIndex.pageCount(), columnIndex::isNonNullPage); + + return applyPredicate(id, func, ROWS_CANNOT_MATCH); + } + + @Override + public RowRanges isNaN(BoundReference ref) { + int id = ref.fieldId(); + + Function func = + columnIndex -> IndexIterator.filter(columnIndex.pageCount(), columnIndex::isNonNullPage); + + return applyPredicate(id, func, ROWS_CANNOT_MATCH); + } + + @Override + public RowRanges notNaN(BoundReference ref) { + // Parquet column index does not contain statistics about NaN values, so cannot filter out any pages. + return allRows; + } + + @Override + public RowRanges lt(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + + Function func = columnIndex -> { + + IntPredicate filter = pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T lower = (T) columnIndex.min(pageIndex); + if (lit.comparator().compare(lower, lit.value()) >= 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + + return IndexIterator.filter(columnIndex.pageCount(), filter); + }; + + return applyPredicate(id, func, ROWS_CANNOT_MATCH); + } + + @Override + public RowRanges ltEq(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + + Function func = columnIndex -> { + + IntPredicate filter = pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T lower = (T) columnIndex.min(pageIndex); + if (lit.comparator().compare(lower, lit.value()) > 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + + return IndexIterator.filter(columnIndex.pageCount(), filter); + }; + + return 
applyPredicate(id, func, ROWS_CANNOT_MATCH); + } + + @Override + public RowRanges gt(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + + Function func = columnIndex -> { + + IntPredicate filter = pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T upper = (T) columnIndex.max(pageIndex); + if (lit.comparator().compare(upper, lit.value()) <= 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + return IndexIterator.filter(columnIndex.pageCount(), filter); + }; + + return applyPredicate(id, func, ROWS_CANNOT_MATCH); + } + + @Override + public RowRanges gtEq(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + + Function func = columnIndex -> { + + IntPredicate filter = pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T upper = (T) columnIndex.max(pageIndex); + if (lit.comparator().compare(upper, lit.value()) < 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + return IndexIterator.filter(columnIndex.pageCount(), filter); + }; + + return applyPredicate(id, func, ROWS_CANNOT_MATCH); + } + + @Override + public RowRanges eq(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + + Function func = columnIndex -> { + + IntPredicate filter = pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T lower = (T) columnIndex.min(pageIndex); + if (lit.comparator().compare(lower, lit.value()) > 0) { + return ROWS_CANNOT_MATCH; + } + + T upper = (T) columnIndex.max(pageIndex); + if (lit.comparator().compare(upper, lit.value()) < 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + + return IndexIterator.filter(columnIndex.pageCount(), filter); + }; + + return applyPredicate(id, func, ROWS_CANNOT_MATCH); + } + + @Override + public RowRanges notEq(BoundReference ref, Literal lit) { + return allRows; + } + + @Override + public RowRanges in(BoundReference ref, Set 
literalSet) { + int id = ref.fieldId(); + Pair minMax = minMax(ref.comparator(), literalSet); + + Function func = columnIndex -> { + + IntPredicate filter = pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T lower = (T) columnIndex.min(pageIndex); + if (ref.comparator().compare(lower, minMax.second()) > 0) { + return ROWS_CANNOT_MATCH; + } + + T upper = (T) columnIndex.max(pageIndex); + if (ref.comparator().compare(upper, minMax.first()) < 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + + return IndexIterator.filter(columnIndex.pageCount(), filter); + }; + + return applyPredicate(id, func, ROWS_CANNOT_MATCH); + } + + private Pair minMax(Comparator comparator, Set literalSet) { + T min = null; + T max = null; + + for (T item : literalSet) { + if (min == null) { + min = item; + max = item; + } else { + if (comparator.compare(item, min) < 0) { + min = item; + } else if (comparator.compare(item, max) > 0) { + max = item; + } + } + } + + return Pair.of(min, max); + } + + @Override + public RowRanges notIn(BoundReference ref, Set literalSet) { + return allRows; + } + + @Override + public RowRanges startsWith(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + + Function func = columnIndex -> { + + ByteBuffer prefixAsBytes = lit.toByteBuffer(); + Comparator comparator = Comparators.unsignedBytes(); + + IntPredicate filter = pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + ByteBuffer lower = columnIndex.minBuffer(pageIndex); + + // truncate lower bound so that its length in bytes is not greater than the length of prefix + int lowerLength = Math.min(prefixAsBytes.remaining(), lower.remaining()); + int lowerCmp = comparator.compare(BinaryUtil.truncateBinary(lower, lowerLength), prefixAsBytes); + if (lowerCmp > 0) { + return ROWS_CANNOT_MATCH; + } + + ByteBuffer upper = columnIndex.maxBuffer(pageIndex); + // truncate upper bound so that its length in 
bytes is not greater than the length of prefix + int upperLength = Math.min(prefixAsBytes.remaining(), upper.remaining()); + int upperCmp = comparator.compare(BinaryUtil.truncateBinary(upper, upperLength), prefixAsBytes); + if (upperCmp < 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + + return IndexIterator.filter(columnIndex.pageCount(), filter); + }; + + return applyPredicate(id, func, ROWS_CANNOT_MATCH); + } + + @Override + public RowRanges notStartsWith(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + + Function func = columnIndex -> { + IntPredicate filter; + if (columnIndex.hasNullCounts()) { + ByteBuffer prefixAsBytes = lit.toByteBuffer(); + Comparator comparator = Comparators.unsignedBytes(); + + filter = pageIndex -> { + if (columnIndex.containsNull(pageIndex)) { + return ROWS_MIGHT_MATCH; + } + + ByteBuffer lower = columnIndex.minBuffer(pageIndex); + // if lower is shorter than the prefix, it can't start with the prefix + if (lower.remaining() < prefixAsBytes.remaining()) { + return ROWS_MIGHT_MATCH; + } + + // truncate lower bound so that its length in bytes is not greater than the length of prefix + int cmp = comparator.compare(BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), prefixAsBytes); + + if (cmp == 0) { + ByteBuffer upper = columnIndex.maxBuffer(pageIndex); + // the lower bound starts with the prefix; check the upper bound + // if upper is shorter than the prefix, it can't start with the prefix + if (upper.remaining() < prefixAsBytes.remaining()) { + return ROWS_MIGHT_MATCH; + } + + // truncate upper bound so that its length in bytes is not greater than the length of prefix + cmp = comparator.compare(BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), prefixAsBytes); + if (cmp == 0) { + // both bounds match the prefix, so all rows must match the prefix and none do not match + return ROWS_CANNOT_MATCH; + } + } + + return ROWS_MIGHT_MATCH; + }; + } else { + // Return all pages if we 
don't have null counts statistics + filter = pageIndex -> ROWS_MIGHT_MATCH; + } + + return IndexIterator.filter(columnIndex.pageCount(), filter); + }; + + return applyPredicate(id, func, ROWS_MIGHT_MATCH); + } + + private RowRanges applyPredicate(int columnId, + Function func, + boolean missingColumnMightMatch) { + + if (!idToColumn.containsKey(columnId)) { + return missingColumnMightMatch ? allRows : NO_ROWS; + } + + // Get the offset index first so that the MissingOffsetIndexException (if any) is thrown ASAP + OffsetIndex offsetIndex = offsetIndex(columnId); + ColumnIndexWrapper columnIndex = columnIndex(columnId); + if (columnIndex == null) { + LOG.info("No column index for column {} is available; Unable to filter on this column", + idToColumn.get(columnId)); + return allRows; + } + + return RowRanges.create(rowCount, func.apply(columnIndex), offsetIndex); + } + + // Assumes that the column corresponding to the id exists in the file. + private OffsetIndex offsetIndex(int columnId) { + return idToOffsetIndex.computeIfAbsent(columnId, k -> columnIndexStore.getOffsetIndex(idToColumn.get(k))); + } + + // Assumes that the column corresponding to the id exists in the file. + private ColumnIndexWrapper columnIndex(int columnId) { + ColumnIndexWrapper wrapper = idToColumnIndex.get(columnId); + + if (wrapper == null) { + ColumnIndex columnIndex = columnIndexStore.getColumnIndex(idToColumn.get(columnId)); + if (columnIndex != null) { + wrapper = new ColumnIndexWrapper(columnIndex, conversion(columnId)); + idToColumnIndex.put(columnId, wrapper); + } + } + + return wrapper; + } + + // Assumes that the field corresponding to the id exists in the Iceberg schema. 
+ private Function conversion(int columnId) { + Function conversion = conversions.get(columnId); + + if (conversion == null) { + Type type = schema.findType(columnId); + Function bytesReorder; + if (type == Types.UUIDType.get() || type instanceof Types.DecimalType) { + // The buffers returned by Parquet are all in little-endian byte order, + // but Conversions#fromByteBuffer use big-endian byte order for UUIDs and Decimals. + bytesReorder = buffer -> toBigEndian(buffer); + } else { + bytesReorder = Function.identity(); + } + + conversion = bytesReorder.andThen(buffer -> Conversions.fromByteBuffer(type, buffer)); + conversions.put(columnId, conversion); + } + + return conversion; + } + } + + private static ByteBuffer toBigEndian(ByteBuffer buffer) { + int size = buffer.remaining(); + ByteBuffer bigEndian = ByteBuffer.allocate(size).order(ByteOrder.BIG_ENDIAN); + for (int i = 0; i < size; i += 1) { + bigEndian.put(i, buffer.get(size - 1 - i)); + } + + return bigEndian; + } + + /** + * A wrapper for ColumnIndex, which will cache statistics data and convert min max buffers to Iceberg type values. 
+ */ + private static class ColumnIndexWrapper { + private final ColumnIndex columnIndex; + private final Function conversion; + + private List nullPages; + private List minBuffers; + private List maxBuffers; + private List nullCounts; // optional field + + private ColumnIndexWrapper(ColumnIndex columnIndex, Function conversion) { + this.columnIndex = columnIndex; + this.conversion = conversion; + } + + private ByteBuffer minBuffer(int pageIndex) { + if (minBuffers == null) { + minBuffers = columnIndex.getMinValues(); + } + + return minBuffers.get(pageIndex); + } + + private ByteBuffer maxBuffer(int pageIndex) { + if (maxBuffers == null) { + maxBuffers = columnIndex.getMaxValues(); + } + + return maxBuffers.get(pageIndex); + } + + private List nullPages() { + if (nullPages == null) { + nullPages = columnIndex.getNullPages(); + } + + return nullPages; + } + + private Object min(int pageIndex) { + return conversion.apply(minBuffer(pageIndex)); + } + + private Object max(int pageIndex) { + return conversion.apply(maxBuffer(pageIndex)); + } + + private Boolean isNullPage(int pageIndex) { + return nullPages().get(pageIndex); + } + + private Boolean isNonNullPage(int pageIndex) { + return !nullPages().get(pageIndex); + } + + private boolean hasNullCounts() { + if (nullCounts == null) { + nullCounts = columnIndex.getNullCounts(); + } + + return nullCounts != null; + } + + private boolean containsNull(int pageIndex) { + if (hasNullCounts()) { + return nullCounts.get(pageIndex) > 0; + } + + throw new UnsupportedOperationException("Has no null counts statistics"); + } + + private int pageCount() { + return nullPages().size(); + } + } +} diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java new file mode 100644 index 000000000000..836ef87b19ad --- /dev/null +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -0,0 +1,738 @@ +/* + * 
Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.parquet; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Locale; +import java.util.PrimitiveIterator; +import org.apache.iceberg.Schema; +import org.apache.iceberg.exceptions.ValidationException; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.parquet.bytes.BytesUtils; +import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.column.columnindex.BoundaryOrder; +import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import 
org.apache.parquet.schema.Type; +import org.junit.Assert; +import org.junit.Test; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notNaN; +import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; +import static org.apache.iceberg.expressions.Expressions.startsWith; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.ASCENDING; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCENDING; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.UNORDERED; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.Types.optional; + +public class TestColumnIndexFilter { + /** + * COPIED FROM org.apache.parquet.internal.filter2.columnindex.TestColumnIndexFilter + **/ + private static final long TOTAL_ROW_COUNT = 30; + private static final String INT_COL = "int_col"; + private 
static final String STR_COL = "str_col"; + private static final String NO_NANS = "no_nans"; + private static final String NO_CI = "no_ci"; + private static final String ALL_NULLS = "all_nulls"; + private static final String ALL_NANS = "all_nans"; + private static final String NOT_IN_FILE = "not_in_file"; + private static final ColumnIndex INT_COL_CI = new CIBuilder(optional(INT32).named(INT_COL), ASCENDING) + .addPage(0, 1, 1) + .addPage(1, 2, 6) + .addPage(0, 7, 7) + .addPage(1, 7, 10) + .addPage(0, 11, 17) + .addPage(0, 18, 23) + .addPage(0, 24, 26) + .build(); + private static final OffsetIndex INT_COL_OI = new OIBuilder() + .addPage(1) + .addPage(6) + .addPage(2) + .addPage(5) + .addPage(7) + .addPage(6) + .addPage(3) + .build(); + private static final ColumnIndex STR_COL_CI = + new CIBuilder(optional(BINARY).as(stringType()).named(STR_COL), DESCENDING) + .addPage(0, "Zulu", "Zulu") + .addPage(0, "Whiskey", "Yankee") + .addPage(1, "Tango", "Victor") + .addNullPage(3) + .addPage(0, "Oscar", "Sierra") + .addPage(0, "Juliett", "November") + .addPage(0, "Bravo", "India") + .addPage(0, "Alfa", "Alfa") + .build(); + private static final OffsetIndex STR_COL_OI = new OIBuilder() + .addPage(1) + .addPage(3) + .addPage(4) + .addPage(3) + .addPage(5) + .addPage(5) + .addPage(8) + .addPage(1) + .build(); + private static final ColumnIndex NO_NANS_CI = new CIBuilder(optional(DOUBLE).named(NO_NANS), UNORDERED) + .addPage(0, 2.03, 2.03) + .addPage(0, 0.56, 8.71) + .addPage(2, 3.14, 3.50) + .addPage(0, 2.71, 9.99) + .addPage(3, 0.36, 5.32) + .addPage(0, 4.17, 7.95) + .addNullPage(4) + .build(); + private static final OffsetIndex NO_NANS_OI = new OIBuilder() + .addPage(1) + .addPage(5) + .addPage(4) + .addPage(6) + .addPage(7) + .addPage(3) + .addPage(4) + .build(); + private static final ColumnIndex NO_CI_CI = null; + private static final OffsetIndex NO_CI_OI = new OIBuilder() + .addPage(1) + .addPage(3) + .addPage(2) + .addPage(1) + .addPage(5) + .addPage(4) + .addPage(5) + 
.addPage(7) + .addPage(2) + .build(); + private static final ColumnIndex ALL_NULLS_CI = new CIBuilder(optional(INT64).named(ALL_NULLS), ASCENDING) + .addNullPage(1) + .addNullPage(29) + .build(); + private static final OffsetIndex ALL_NULLS_OI = new OIBuilder() + .addPage(1) + .addPage(29) + .build(); + private static final ColumnIndex ALL_NANS_CI = new CIBuilder(optional(DOUBLE).named(ALL_NANS), UNORDERED) + .addPage(1, Double.NaN, Double.NaN) + .addPage(29, Double.NaN, Double.NaN) + .build(); + private static final ColumnIndexStore STORE = new ColumnIndexStore() { + @Override + public ColumnIndex getColumnIndex(ColumnPath column) { + switch (column.toDotString()) { + case INT_COL: + return INT_COL_CI; + case STR_COL: + return STR_COL_CI; + case NO_NANS: + return NO_NANS_CI; + case NO_CI: + return NO_CI_CI; + case ALL_NULLS: + return ALL_NULLS_CI; + case ALL_NANS: + return ALL_NANS_CI; + default: + return null; + } + } + + @Override + public OffsetIndex getOffsetIndex(ColumnPath column) { + switch (column.toDotString()) { + case INT_COL: + return INT_COL_OI; + case STR_COL: + return STR_COL_OI; + case NO_NANS: + return NO_NANS_OI; + case NO_CI: + return NO_CI_OI; + case ALL_NULLS: + return ALL_NULLS_OI; + case ALL_NANS: + return ALL_NANS_OI; + default: + throw new MissingOffsetIndexException(column); + } + } + }; + private static final OffsetIndex ALL_NANS_OI = new OIBuilder() + .addPage(1) + .addPage(29) + .build(); + /** + *
+   * row   int_col       str_col        no_nans        no_ci          all_nulls      all_nans
+   *                                                 (no column index)
+   *      ------0------  ------0------  ------0------  ------0------  ------0------  ------0------
+   * 0.   1              Zulu           2.03                          null           NaN
+   *      ------1------  ------1------  ------1------  ------1------  ------1------  ------1------
+   * 1.   2              Yankee         4.67                          null           NaN
+   * 2.   3              Xray           3.42                          null           NaN
+   * 3.   4              Whiskey        8.71                          null           NaN
+   *                     ------2------                 ------2------
+   * 4.   5              Victor         0.56                          null           NaN
+   * 5.   6              Uniform        4.30                          null           NaN
+   *                                    ------2------  ------3------
+   * 6.   null           null           null                          null           NaN
+   *      ------2------                                ------4------
+   * 7.   7              Tango          3.50                          null           NaN
+   *                     ------3------
+   * 8.   7              null           3.14                          null           NaN
+   *      ------3------
+   * 9.   7              null           null                          null           NaN
+   *                                    ------3------
+   * 10.  null           null           9.99                          null           NaN
+   *                     ------4------
+   * 11.  8              Sierra         8.78                          null           NaN
+   *                                                   ------5------
+   * 12.  9              Romeo          9.56                          null           NaN
+   * 13.  10             Quebec         2.71                          null           NaN
+   *      ------4------
+   * 14.  11             Papa           5.71                          null           NaN
+   * 15.  12             Oscar          4.09                          null           NaN
+   *                     ------5------  ------4------  ------6------
+   * 16.  13             November       null                          null           NaN
+   * 17.  14             Mike           null                          null           NaN
+   * 18.  15             Lima           0.36                          null           NaN
+   * 19.  16             Kilo           2.94                          null           NaN
+   * 20.  17             Juliett        4.23                          null           NaN
+   *      ------5------  ------6------                 ------7------
+   * 21.  18             India          null                          null           NaN
+   * 22.  19             Hotel          5.32                          null           NaN
+   *                                    ------5------
+   * 23.  20             Golf           4.17                          null           NaN
+   * 24.  21             Foxtrot        7.92                          null           NaN
+   * 25.  22             Echo           7.95                          null           NaN
+   *                                    ------6------
+   * 26.  23             Delta          null                          null           NaN
+   *      ------6------
+   * 27.  24             Charlie        null                          null           NaN
+   *                                                   ------8------
+   * 28.  25             Bravo          null                          null           NaN
+   *                     ------7------
+   * 29.  26             Alfa           null                          null           NaN
+   * 
+ */ + + private static final Schema SCHEMA = new Schema( + Types.NestedField.optional(1, INT_COL, Types.IntegerType.get()), + Types.NestedField.optional(2, STR_COL, Types.StringType.get()), + Types.NestedField.optional(3, NO_NANS, Types.DoubleType.get()), + Types.NestedField.optional(4, NO_CI, Types.IntegerType.get()), + Types.NestedField.optional(5, ALL_NULLS, Types.LongType.get()), + Types.NestedField.optional(6, ALL_NANS, Types.DoubleType.get()), + Types.NestedField.optional(7, NOT_IN_FILE, Types.LongType.get()) + ); + private static final Schema SCHEMA_MISSING_COLUMN = new Schema( + Types.NestedField.optional(1, INT_COL, Types.IntegerType.get()), + Types.NestedField.optional(2, STR_COL, Types.StringType.get()), + Types.NestedField.optional(3, NO_NANS, Types.DoubleType.get()), + Types.NestedField.optional(4, NO_CI, Types.IntegerType.get()), + Types.NestedField.optional(5, ALL_NULLS, Types.LongType.get()), + Types.NestedField.optional(6, ALL_NANS, Types.DoubleType.get()) + ); + + /** END **/ + + private static final MessageType FILE_SCHEMA = ParquetSchemaUtil.convert(SCHEMA_MISSING_COLUMN, "table"); + private static final RowRanges ALL_ROWS = RowRanges.createSingle(TOTAL_ROW_COUNT); + private static final RowRanges NO_ROWS = RowRanges.EMPTY; + + private static RowRanges createRowRanges(String path, Integer... 
pageIndexes) { + return RowRanges.create(TOTAL_ROW_COUNT, new PrimitiveIterator.OfInt() { + int index = -1; + + @Override + public int nextInt() { + return pageIndexes[index]; + } + + @Override + public boolean hasNext() { + index += 1; + return index < pageIndexes.length; + } + }, STORE.getOffsetIndex(ColumnPath.fromDotString(path))); + } + + private boolean rowRangesEquals(RowRanges r1, RowRanges r2) { + if (r1 == r2) { + return true; + } + + if (r1 == null || r2 == null) { + return false; + } + + List ranges1 = r1.getRanges(); + List ranges2 = r2.getRanges(); + + if (ranges1.size() != ranges2.size()) { + return false; + } + + for (int i = 0; i < ranges1.size(); i += 1) { + RowRanges.Range range1 = ranges1.get(i); + RowRanges.Range range2 = ranges2.get(i); + if (range1.from != range2.from || range1.to != range2.to) { + return false; + } + } + + return true; + } + + private void assertRowRangesEquals(RowRanges expected, RowRanges actual) { + if (!rowRangesEquals(expected, actual)) { + throw new AssertionError(String.format("RowRanges are not equal, expected: %s, actual: %s", + expected, actual)); + } + } + + private RowRanges calculateRowRanges(Expression expr) { + return calculateRowRanges(SCHEMA, expr, true); + } + + private RowRanges calculateRowRanges(Expression expr, boolean caseSensitive) { + return calculateRowRanges(SCHEMA, expr, caseSensitive); + } + + private RowRanges calculateRowRanges(Schema schema, Expression expr, boolean caseSensitive) { + return new ParquetColumnIndexFilter(schema, expr, caseSensitive) + .calculateRowRanges(FILE_SCHEMA, STORE, TOTAL_ROW_COUNT); + } + + private RowRanges calculateRowRanges(Schema schema, MessageType messageType, Expression expr, boolean caseSensitive) { + return new ParquetColumnIndexFilter(schema, expr, caseSensitive) + .calculateRowRanges(messageType, STORE, TOTAL_ROW_COUNT); + } + + @Test + public void testIsNulls() { + RowRanges expected; + + expected = createRowRanges(INT_COL, 1, 3); + 
assertRowRangesEquals(expected, calculateRowRanges(isNull(INT_COL))); + + expected = createRowRanges(STR_COL, 2, 3); + assertRowRangesEquals(expected, calculateRowRanges(isNull(STR_COL))); + + expected = createRowRanges(NO_NANS, 2, 4, 6); + assertRowRangesEquals(expected, calculateRowRanges(isNull(NO_NANS))); + + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(isNull(ALL_NULLS))); + } + + @Test + public void testNotNulls() { + RowRanges expected; + + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(notNull(INT_COL))); + + expected = createRowRanges(STR_COL, 0, 1, 2, 4, 5, 6, 7); + assertRowRangesEquals(expected, calculateRowRanges(notNull(STR_COL))); + + expected = createRowRanges(NO_NANS, 0, 1, 2, 3, 4, 5); + assertRowRangesEquals(expected, calculateRowRanges(notNull(NO_NANS))); + + expected = NO_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(notNull(ALL_NULLS))); + } + + @Test + public void testIsNaN() { + RowRanges expected; + + // column index exists, null page 6 should be filtered out + expected = createRowRanges(NO_NANS, 0, 1, 2, 3, 4, 5); + assertRowRangesEquals(expected, calculateRowRanges(isNaN(NO_NANS))); + + assertRowRangesEquals(ALL_ROWS, calculateRowRanges(isNaN(ALL_NANS))); + } + + @Test + public void testNotNaN() { + RowRanges expected; + + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(notNaN(NO_NANS))); + + assertRowRangesEquals(expected, calculateRowRanges(notNaN(ALL_NANS))); + } + + @Test + public void testMissingColumn() { + Assert.assertThrows("Cannot find field 'missing'", + ValidationException.class, () -> calculateRowRanges(equal("missing", 0))); + } + + @Test + public void testColumnNotInFile() { + RowRanges expected; + + expected = NO_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(notNull(NOT_IN_FILE))); + } + + @Test + public void testMissingColumnIndex() { + RowRanges expected = ALL_ROWS; + + assertRowRangesEquals(expected, 
calculateRowRanges(isNull(NO_CI))); + assertRowRangesEquals(expected, calculateRowRanges(notNull(NO_CI))); + assertRowRangesEquals(expected, calculateRowRanges(greaterThan(NO_CI, 9))); + assertRowRangesEquals(expected, calculateRowRanges(lessThan(NO_CI, 9))); + assertRowRangesEquals(expected, calculateRowRanges(equal(NO_CI, 9))); + assertRowRangesEquals(expected, calculateRowRanges(notEqual(NO_CI, 9))); + } + + @Test + public void testNot() { + // ColumnIndexEvalVisitor does not support evaluating NOT expression, but NOT should be rewritten + RowRanges expected; + + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(not(lessThan(INT_COL, 1)))); + + expected = createRowRanges(INT_COL, 1, 2, 3, 4, 5, 6); + assertRowRangesEquals(expected, calculateRowRanges(not(lessThanOrEqual(INT_COL, 1)))); + } + + @Test + public void testAnd() { + RowRanges expected; + Expression expr; + + expected = NO_ROWS; + expr = Expressions.and(equal(INT_COL, 1), equal(INT_COL, 2)); + assertRowRangesEquals(expected, calculateRowRanges(expr)); + + expr = Expressions.and(equal(INT_COL, 1), equal(STR_COL, "Alfa")); + assertRowRangesEquals(expected, calculateRowRanges(expr)); + + expr = Expressions.and(equal(INT_COL, 2), equal(STR_COL, "Tango")); + expected = RowRanges.intersection(createRowRanges(INT_COL, 1), createRowRanges(STR_COL, 2)); + assertRowRangesEquals(expected, calculateRowRanges(expr)); + } + + @Test + public void testOr() { + RowRanges expected; + Expression expr; + + expected = createRowRanges(INT_COL, 0, 1); + expr = Expressions.or(equal(INT_COL, 1), equal(INT_COL, 2)); + assertRowRangesEquals(expected, calculateRowRanges(expr)); + + expected = RowRanges.union(createRowRanges(INT_COL, 0), createRowRanges(STR_COL, 7)); + expr = Expressions.or(equal(INT_COL, 1), equal(STR_COL, "Alfa")); + assertRowRangesEquals(expected, calculateRowRanges(expr)); + + expr = Expressions.or(equal(INT_COL, 2), equal(STR_COL, "Tango")); + expected = 
RowRanges.union(createRowRanges(INT_COL, 1), createRowRanges(STR_COL, 2)); + assertRowRangesEquals(expected, calculateRowRanges(expr)); + } + + @Test + public void testIntegerLt() { + RowRanges expected; + + expected = NO_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(lessThan(INT_COL, 1))); + + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(lessThan(INT_COL, 27))); + + expected = createRowRanges(INT_COL, 0, 1); + assertRowRangesEquals(expected, calculateRowRanges(lessThan(INT_COL, 7))); + + expected = createRowRanges(INT_COL, 0, 1, 2, 3); + assertRowRangesEquals(expected, calculateRowRanges(lessThan(INT_COL, 10))); + } + + @Test + public void testIntegerLtEq() { + RowRanges expected; + + expected = NO_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 0))); + + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 27))); + + expected = createRowRanges(INT_COL, 0, 1, 2, 3); + assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 7))); + + expected = createRowRanges(INT_COL, 0, 1, 2, 3, 4); + assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 11))); + + expected = createRowRanges(INT_COL, 0); + + assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 1))); + } + + @Test + public void testIntegerGt() { + RowRanges expected; + + expected = NO_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(greaterThan(INT_COL, 26))); + + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(greaterThan(INT_COL, 0))); + + expected = createRowRanges(INT_COL, 3, 4, 5, 6); + assertRowRangesEquals(expected, calculateRowRanges(greaterThan(INT_COL, 7))); + } + + @Test + public void testIntegerGtEq() { + RowRanges expected; + + expected = NO_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(greaterThanOrEqual(INT_COL, 27))); + + expected = ALL_ROWS; + 
assertRowRangesEquals(expected, calculateRowRanges(greaterThanOrEqual(INT_COL, 1))); + + expected = createRowRanges(INT_COL, 2, 3, 4, 5, 6); + assertRowRangesEquals(expected, calculateRowRanges(greaterThanOrEqual(INT_COL, 7))); + } + + @Test + public void testIntegerEq() { + RowRanges expected; + + expected = NO_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(equal(INT_COL, 0))); + + expected = createRowRanges(INT_COL, 2, 3); + assertRowRangesEquals(expected, calculateRowRanges(equal(INT_COL, 7))); + + expected = createRowRanges(INT_COL, 0); + assertRowRangesEquals(expected, calculateRowRanges(equal(INT_COL, 1))); + } + + @Test + public void testIntegerNotEq() { + RowRanges expected; + + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(notEqual(INT_COL, 0))); + + // TODO 如果是不会被截断的类型,可以用最大最小值做评估,跳过没有null值,且 min == max == value 的 pages + assertRowRangesEquals(expected, calculateRowRanges(notEqual(INT_COL, 7))); + } + + @Test + public void testCaseInsensitive() { + RowRanges expected; + + String intColAllCaps = INT_COL.toUpperCase(Locale.ROOT); + + expected = NO_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(equal(intColAllCaps, 0), false)); + + expected = createRowRanges(INT_COL, 2, 3); + assertRowRangesEquals(expected, calculateRowRanges(equal(intColAllCaps, 7), false)); + + expected = createRowRanges(INT_COL, 0); + assertRowRangesEquals(expected, calculateRowRanges(equal(intColAllCaps, 1), false)); + } + + @Test + public void testStringStartsWith() { + RowRanges expected; + + expected = NO_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(startsWith(STR_COL, "?"))); + + expected = createRowRanges(STR_COL, 0); + assertRowRangesEquals(expected, calculateRowRanges(startsWith(STR_COL, "Z"))); + } + + @Test + public void testStringNotStartsWith() { + RowRanges expected; + + expected = createRowRanges(STR_COL, 1, 2, 3, 4, 5, 6, 7); + assertRowRangesEquals(expected, calculateRowRanges(notStartsWith(STR_COL, "Z"))); 
+ + expected = createRowRanges(STR_COL, 0, 1, 2, 3, 4, 5, 6); + assertRowRangesEquals(expected, calculateRowRanges(notStartsWith(STR_COL, "A"))); + } + + @Test + public void testIntegerIn() { + RowRanges expected; + Expression expr; + + expr = Expressions.in(INT_COL, 7, 13); + expected = createRowRanges(INT_COL, 2, 3, 4); + assertRowRangesEquals(expected, calculateRowRanges(expr)); + } + + @Test + public void testIntegerNotIn() { + RowRanges expected; + Expression expr; + + expr = Expressions.notIn(INT_COL, 7, 13); + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(expr)); + } + + @Test + public void testSomeNullsNotEq() { + RowRanges expected; + Expression expr; + + expr = Expressions.notEqual(STR_COL, "equal"); + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(expr)); + } + + @Test + public void testTypePromotion() { + RowRanges expected; + Schema promotedLong = new Schema(Types.NestedField.optional(1, INT_COL, Types.LongType.get())); + + expected = NO_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(promotedLong, equal(INT_COL, 0), true)); + + expected = createRowRanges(INT_COL, 2, 3); + assertRowRangesEquals(expected, calculateRowRanges(promotedLong, equal(INT_COL, 7), true)); + } + + @Test + public void testMissingOffsetIndex() { + RowRanges expected; + + PrimitiveType missingOI = org.apache.parquet.schema.Types.primitive(INT32, Type.Repetition.REQUIRED) + .id(1) + .named("missing_oi"); + MessageType messageType = new MessageType("test", missingOI); + + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(SCHEMA, messageType, equal(INT_COL, 1), true)); + } + + private static class CIBuilder { + private static final ByteBuffer EMPTY = ByteBuffer.wrap(new byte[0]); + private final PrimitiveType type; + private final BoundaryOrder order; + boolean invalid = false; + private List nullPages = Lists.newArrayList(); + private List nullCounts = Lists.newArrayList(); + private List 
minValues = Lists.newArrayList(); + private List maxValues = Lists.newArrayList(); + + CIBuilder(PrimitiveType type, BoundaryOrder order) { + this.type = type; + this.order = order; + } + + CIBuilder addNullPage(long nullCount) { + nullPages.add(true); + nullCounts.add(nullCount); + minValues.add(EMPTY); + maxValues.add(EMPTY); + return this; + } + + CIBuilder addPage(long nullCount, int min, int max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(min))); + maxValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(max))); + return this; + } + + CIBuilder addPage(long nullCount, String min, String max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(min.getBytes(UTF_8))); + maxValues.add(ByteBuffer.wrap(max.getBytes(UTF_8))); + return this; + } + + CIBuilder addPage(long nullCount, double min, double max) { + if (Double.isNaN(min) || Double.isNaN(max)) { + invalid = true; + return this; + } + + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(min)))); + maxValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(max)))); + return this; + } + + ColumnIndex build() { + return invalid ? null : ColumnIndexBuilder.build(type, order, nullPages, nullCounts, minValues, maxValues); + } + } + + private static class OIBuilder { + private final OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); + + OIBuilder addPage(long rowCount) { + builder.add(1234, rowCount); + return this; + } + + OffsetIndex build() { + return builder.build(); + } + } +} From 4466e3dd630a3391e9d2588b237a72cee6bc68ed Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Wed, 15 Feb 2023 11:37:03 +0800 Subject: [PATCH 02/21] Fix predicate on decimal type columns and add tests. 
--- .../parquet/ParquetColumnIndexFilter.java | 17 +- .../parquet/TestColumnIndexFilter.java | 209 ++++++++++++++---- 2 files changed, 181 insertions(+), 45 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java index c466d17cae17..38c2f5133f52 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java @@ -522,16 +522,17 @@ private Function conversion(int columnId) { if (conversion == null) { Type type = schema.findType(columnId); - Function bytesReorder; - if (type == Types.UUIDType.get() || type instanceof Types.DecimalType) { - // The buffers returned by Parquet are all in little-endian byte order, + conversion = buffer -> { + // The buffers returned by Parquet might be in little-endian byte order, // but Conversions#fromByteBuffer use big-endian byte order for UUIDs and Decimals. 
- bytesReorder = buffer -> toBigEndian(buffer); - } else { - bytesReorder = Function.identity(); - } + if ((type == Types.UUIDType.get() || type instanceof Types.DecimalType) && + buffer.order() == ByteOrder.LITTLE_ENDIAN) { + return Conversions.fromByteBuffer(type, toBigEndian(buffer)); + } else { + return Conversions.fromByteBuffer(type, buffer); + } + }; - conversion = bytesReorder.andThen(buffer -> Conversions.fromByteBuffer(type, buffer)); conversions.put(columnId, conversion); } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java index 836ef87b19ad..70f7cde1d33e 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -19,7 +19,9 @@ package org.apache.iceberg.parquet; +import java.math.BigDecimal; import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.List; import java.util.Locale; import java.util.PrimitiveIterator; @@ -28,8 +30,9 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; -import org.apache.parquet.bytes.BytesUtils; +import org.apache.iceberg.util.DecimalUtil; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.internal.column.columnindex.BoundaryOrder; import org.apache.parquet.internal.column.columnindex.ColumnIndex; @@ -64,6 +67,7 @@ import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; import static 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; import static org.apache.parquet.schema.Types.optional; @@ -285,8 +289,12 @@ public OffsetIndex getOffsetIndex(ColumnPath column) { private static final RowRanges ALL_ROWS = RowRanges.createSingle(TOTAL_ROW_COUNT); private static final RowRanges NO_ROWS = RowRanges.EMPTY; - private static RowRanges createRowRanges(String path, Integer... pageIndexes) { - return RowRanges.create(TOTAL_ROW_COUNT, new PrimitiveIterator.OfInt() { + private static RowRanges selectRowRanges(String path, int... pageIndexes) { + return selectRowRanges(path, STORE, TOTAL_ROW_COUNT, pageIndexes); + } + + private static RowRanges selectRowRanges(String path, ColumnIndexStore store, long rowCount, int... pageIndexes) { + return RowRanges.create(rowCount, new PrimitiveIterator.OfInt() { int index = -1; @Override @@ -299,7 +307,7 @@ public boolean hasNext() { index += 1; return index < pageIndexes.length; } - }, STORE.getOffsetIndex(ColumnPath.fromDotString(path))); + }, store.getOffsetIndex(ColumnPath.fromDotString(path))); } private boolean rowRangesEquals(RowRanges r1, RowRanges r2) { @@ -350,21 +358,26 @@ private RowRanges calculateRowRanges(Schema schema, Expression expr, boolean cas } private RowRanges calculateRowRanges(Schema schema, MessageType messageType, Expression expr, boolean caseSensitive) { + return calculateRowRanges(schema, messageType, expr, caseSensitive, STORE, TOTAL_ROW_COUNT); + } + + private RowRanges calculateRowRanges(Schema schema, MessageType messageType, Expression expr, + boolean caseSensitive, ColumnIndexStore store, long rowCount) { return new ParquetColumnIndexFilter(schema, expr, caseSensitive) - .calculateRowRanges(messageType, STORE, TOTAL_ROW_COUNT); + .calculateRowRanges(messageType, store, rowCount); } @Test public void testIsNulls() { RowRanges expected; - expected = createRowRanges(INT_COL, 1, 3); + expected = 
selectRowRanges(INT_COL, 1, 3); assertRowRangesEquals(expected, calculateRowRanges(isNull(INT_COL))); - expected = createRowRanges(STR_COL, 2, 3); + expected = selectRowRanges(STR_COL, 2, 3); assertRowRangesEquals(expected, calculateRowRanges(isNull(STR_COL))); - expected = createRowRanges(NO_NANS, 2, 4, 6); + expected = selectRowRanges(NO_NANS, 2, 4, 6); assertRowRangesEquals(expected, calculateRowRanges(isNull(NO_NANS))); expected = ALL_ROWS; @@ -378,10 +391,10 @@ public void testNotNulls() { expected = ALL_ROWS; assertRowRangesEquals(expected, calculateRowRanges(notNull(INT_COL))); - expected = createRowRanges(STR_COL, 0, 1, 2, 4, 5, 6, 7); + expected = selectRowRanges(STR_COL, 0, 1, 2, 4, 5, 6, 7); assertRowRangesEquals(expected, calculateRowRanges(notNull(STR_COL))); - expected = createRowRanges(NO_NANS, 0, 1, 2, 3, 4, 5); + expected = selectRowRanges(NO_NANS, 0, 1, 2, 3, 4, 5); assertRowRangesEquals(expected, calculateRowRanges(notNull(NO_NANS))); expected = NO_ROWS; @@ -393,7 +406,7 @@ public void testIsNaN() { RowRanges expected; // column index exists, null page 6 should be filtered out - expected = createRowRanges(NO_NANS, 0, 1, 2, 3, 4, 5); + expected = selectRowRanges(NO_NANS, 0, 1, 2, 3, 4, 5); assertRowRangesEquals(expected, calculateRowRanges(isNaN(NO_NANS))); assertRowRangesEquals(ALL_ROWS, calculateRowRanges(isNaN(ALL_NANS))); @@ -443,7 +456,7 @@ public void testNot() { expected = ALL_ROWS; assertRowRangesEquals(expected, calculateRowRanges(not(lessThan(INT_COL, 1)))); - expected = createRowRanges(INT_COL, 1, 2, 3, 4, 5, 6); + expected = selectRowRanges(INT_COL, 1, 2, 3, 4, 5, 6); assertRowRangesEquals(expected, calculateRowRanges(not(lessThanOrEqual(INT_COL, 1)))); } @@ -460,7 +473,7 @@ public void testAnd() { assertRowRangesEquals(expected, calculateRowRanges(expr)); expr = Expressions.and(equal(INT_COL, 2), equal(STR_COL, "Tango")); - expected = RowRanges.intersection(createRowRanges(INT_COL, 1), createRowRanges(STR_COL, 2)); + expected = 
RowRanges.intersection(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -469,16 +482,16 @@ public void testOr() { RowRanges expected; Expression expr; - expected = createRowRanges(INT_COL, 0, 1); + expected = selectRowRanges(INT_COL, 0, 1); expr = Expressions.or(equal(INT_COL, 1), equal(INT_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); - expected = RowRanges.union(createRowRanges(INT_COL, 0), createRowRanges(STR_COL, 7)); + expected = RowRanges.union(selectRowRanges(INT_COL, 0), selectRowRanges(STR_COL, 7)); expr = Expressions.or(equal(INT_COL, 1), equal(STR_COL, "Alfa")); assertRowRangesEquals(expected, calculateRowRanges(expr)); expr = Expressions.or(equal(INT_COL, 2), equal(STR_COL, "Tango")); - expected = RowRanges.union(createRowRanges(INT_COL, 1), createRowRanges(STR_COL, 2)); + expected = RowRanges.union(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -492,10 +505,10 @@ public void testIntegerLt() { expected = ALL_ROWS; assertRowRangesEquals(expected, calculateRowRanges(lessThan(INT_COL, 27))); - expected = createRowRanges(INT_COL, 0, 1); + expected = selectRowRanges(INT_COL, 0, 1); assertRowRangesEquals(expected, calculateRowRanges(lessThan(INT_COL, 7))); - expected = createRowRanges(INT_COL, 0, 1, 2, 3); + expected = selectRowRanges(INT_COL, 0, 1, 2, 3); assertRowRangesEquals(expected, calculateRowRanges(lessThan(INT_COL, 10))); } @@ -509,13 +522,13 @@ public void testIntegerLtEq() { expected = ALL_ROWS; assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 27))); - expected = createRowRanges(INT_COL, 0, 1, 2, 3); + expected = selectRowRanges(INT_COL, 0, 1, 2, 3); assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 7))); - expected = createRowRanges(INT_COL, 0, 1, 2, 3, 4); + expected = selectRowRanges(INT_COL, 0, 1, 2, 3, 4); 
assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 11))); - expected = createRowRanges(INT_COL, 0); + expected = selectRowRanges(INT_COL, 0); assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 1))); } @@ -530,7 +543,7 @@ public void testIntegerGt() { expected = ALL_ROWS; assertRowRangesEquals(expected, calculateRowRanges(greaterThan(INT_COL, 0))); - expected = createRowRanges(INT_COL, 3, 4, 5, 6); + expected = selectRowRanges(INT_COL, 3, 4, 5, 6); assertRowRangesEquals(expected, calculateRowRanges(greaterThan(INT_COL, 7))); } @@ -544,7 +557,7 @@ public void testIntegerGtEq() { expected = ALL_ROWS; assertRowRangesEquals(expected, calculateRowRanges(greaterThanOrEqual(INT_COL, 1))); - expected = createRowRanges(INT_COL, 2, 3, 4, 5, 6); + expected = selectRowRanges(INT_COL, 2, 3, 4, 5, 6); assertRowRangesEquals(expected, calculateRowRanges(greaterThanOrEqual(INT_COL, 7))); } @@ -555,10 +568,10 @@ public void testIntegerEq() { expected = NO_ROWS; assertRowRangesEquals(expected, calculateRowRanges(equal(INT_COL, 0))); - expected = createRowRanges(INT_COL, 2, 3); + expected = selectRowRanges(INT_COL, 2, 3); assertRowRangesEquals(expected, calculateRowRanges(equal(INT_COL, 7))); - expected = createRowRanges(INT_COL, 0); + expected = selectRowRanges(INT_COL, 0); assertRowRangesEquals(expected, calculateRowRanges(equal(INT_COL, 1))); } @@ -569,7 +582,6 @@ public void testIntegerNotEq() { expected = ALL_ROWS; assertRowRangesEquals(expected, calculateRowRanges(notEqual(INT_COL, 0))); - // TODO 如果是不会被截断的类型,可以用最大最小值做评估,跳过没有null值,且 min == max == value 的 pages assertRowRangesEquals(expected, calculateRowRanges(notEqual(INT_COL, 7))); } @@ -582,10 +594,10 @@ public void testCaseInsensitive() { expected = NO_ROWS; assertRowRangesEquals(expected, calculateRowRanges(equal(intColAllCaps, 0), false)); - expected = createRowRanges(INT_COL, 2, 3); + expected = selectRowRanges(INT_COL, 2, 3); assertRowRangesEquals(expected, 
calculateRowRanges(equal(intColAllCaps, 7), false)); - expected = createRowRanges(INT_COL, 0); + expected = selectRowRanges(INT_COL, 0); assertRowRangesEquals(expected, calculateRowRanges(equal(intColAllCaps, 1), false)); } @@ -596,7 +608,7 @@ public void testStringStartsWith() { expected = NO_ROWS; assertRowRangesEquals(expected, calculateRowRanges(startsWith(STR_COL, "?"))); - expected = createRowRanges(STR_COL, 0); + expected = selectRowRanges(STR_COL, 0); assertRowRangesEquals(expected, calculateRowRanges(startsWith(STR_COL, "Z"))); } @@ -604,10 +616,10 @@ public void testStringStartsWith() { public void testStringNotStartsWith() { RowRanges expected; - expected = createRowRanges(STR_COL, 1, 2, 3, 4, 5, 6, 7); + expected = selectRowRanges(STR_COL, 1, 2, 3, 4, 5, 6, 7); assertRowRangesEquals(expected, calculateRowRanges(notStartsWith(STR_COL, "Z"))); - expected = createRowRanges(STR_COL, 0, 1, 2, 3, 4, 5, 6); + expected = selectRowRanges(STR_COL, 0, 1, 2, 3, 4, 5, 6); assertRowRangesEquals(expected, calculateRowRanges(notStartsWith(STR_COL, "A"))); } @@ -617,7 +629,7 @@ public void testIntegerIn() { Expression expr; expr = Expressions.in(INT_COL, 7, 13); - expected = createRowRanges(INT_COL, 2, 3, 4); + expected = selectRowRanges(INT_COL, 2, 3, 4); assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -649,7 +661,7 @@ public void testTypePromotion() { expected = NO_ROWS; assertRowRangesEquals(expected, calculateRowRanges(promotedLong, equal(INT_COL, 0), true)); - expected = createRowRanges(INT_COL, 2, 3); + expected = selectRowRanges(INT_COL, 2, 3); assertRowRangesEquals(expected, calculateRowRanges(promotedLong, equal(INT_COL, 7), true)); } @@ -666,6 +678,109 @@ public void testMissingOffsetIndex() { assertRowRangesEquals(expected, calculateRowRanges(SCHEMA, messageType, equal(INT_COL, 1), true)); } + // 38 precision 10 scale decimal to bytes + private byte[] decimalToBytes(String decimalStr) { + BigDecimal decimal = new 
BigDecimal(decimalStr).setScale(10); + int requiredBytes = TypeUtil.decimalRequiredBytes(38); + byte[] bytes = new byte[requiredBytes]; + return DecimalUtil.toReusedFixLengthBytes(38, 10, decimal, bytes); + } + + @Test + public void testDecimalType() { + String intDecimal = "decimal_7_2"; + String longDecimal = "decimal_11_2"; + String binaryDecimal = "decimal_38_10"; + long rowCount = 9; + ColumnIndex intDecimalCI = new CIBuilder(optional(INT32).named(intDecimal), ASCENDING) + .addPage(0, 1234, 1235) + .addPage(1, 1235, 1235) + .addPage(2, 1237, 9999) + .build(); + + OffsetIndex intDecimalOI = new OIBuilder() + .addPage(2) + .addPage(3) + .addPage(4) + .build(); + + ColumnIndex binaryDecimalCI = new CIBuilder(optional(FIXED_LEN_BYTE_ARRAY) + .length(TypeUtil.decimalRequiredBytes(38)) + .named(binaryDecimal), ASCENDING) + .addPage(0, decimalToBytes("12.34"), decimalToBytes("12.35")) + .addPage(0, decimalToBytes("12.35"), decimalToBytes("12.39")) + .build(); + + OffsetIndex binaryDecimalOI = new OIBuilder() + .addPage(5) + .addPage(4) + .build(); + + ColumnIndexStore columnIndexStore = new ColumnIndexStore() { + @Override + public ColumnIndex getColumnIndex(ColumnPath columnPath) { + switch (columnPath.toDotString()) { + case "decimal_7_2": + return intDecimalCI; + case "decimal_38_10": + return binaryDecimalCI; + default: + return null; + } + } + + @Override + public OffsetIndex getOffsetIndex(ColumnPath columnPath) { + switch (columnPath.toDotString()) { + case "decimal_7_2": + return intDecimalOI; + case "decimal_38_10": + return binaryDecimalOI; + default: + throw new MissingOffsetIndexException(columnPath); + } + } + }; + + MessageType messageType = new MessageType("test", + org.apache.parquet.schema.Types.primitive(INT32, Type.Repetition.OPTIONAL) + .id(1) + .named("decimal_7_2"), + org.apache.parquet.schema.Types.primitive(FIXED_LEN_BYTE_ARRAY, Type.Repetition.OPTIONAL) + .length(TypeUtil.decimalRequiredBytes(38)) + .id(3) + .named("decimal_38_10")); + + 
Schema schema = new Schema(Types.NestedField.optional(1, intDecimal, Types.DecimalType.of(7, 2)), + Types.NestedField.optional(2, longDecimal, Types.DecimalType.of(11, 2)), + Types.NestedField.optional(3, binaryDecimal, Types.DecimalType.of(38, 10))); + + Expression expr = Expressions.and( + lessThan(intDecimal, new BigDecimal("12.37")), + greaterThanOrEqual(intDecimal, new BigDecimal("12.35")) + ); + + RowRanges expected = selectRowRanges(intDecimal, columnIndexStore, rowCount, 0, 1); + RowRanges actual = calculateRowRanges(schema, messageType, expr, true, columnIndexStore, rowCount); + + assertRowRangesEquals(expected, actual); + + expr = Expressions.and( + lessThan(binaryDecimal, new BigDecimal("12.37")), + greaterThanOrEqual(binaryDecimal, new BigDecimal("12.35")) + ); + + expected = selectRowRanges(binaryDecimal, columnIndexStore, rowCount, 0, 1); + actual = calculateRowRanges(schema, messageType, expr, true, columnIndexStore, rowCount); + assertRowRangesEquals(expected, actual); + + + expr = Expressions.greaterThan(binaryDecimal, new BigDecimal("99.99")); + expected = NO_ROWS; + actual = calculateRowRanges(schema, messageType, expr, true, columnIndexStore, rowCount); + assertRowRangesEquals(expected, actual); + } + private static class CIBuilder { private static final ByteBuffer EMPTY = ByteBuffer.wrap(new byte[0]); private final PrimitiveType type; @@ -689,11 +804,29 @@ CIBuilder addNullPage(long nullCount) { return this; } + CIBuilder addPage(long nullCount, byte[] min, byte[] max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(min)); + maxValues.add(ByteBuffer.wrap(max)); + return this; + } + CIBuilder addPage(long nullCount, int min, int max) { nullPages.add(false); nullCounts.add(nullCount); - minValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(min))); - maxValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(max))); + minValues.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, min)); + 
maxValues.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, max)); +// minValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(min))); +// maxValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(max))); + return this; + } + + CIBuilder addPage(long nullCount, long min, long max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.allocate(Long.BYTES).order(ByteOrder.LITTLE_ENDIAN).putLong(0, min)); + maxValues.add(ByteBuffer.allocate(Long.BYTES).order(ByteOrder.LITTLE_ENDIAN).putLong(0, max)); return this; } @@ -713,8 +846,10 @@ CIBuilder addPage(long nullCount, double min, double max) { nullPages.add(false); nullCounts.add(nullCount); - minValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(min)))); - maxValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(max)))); + minValues.add(ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, min)); + maxValues.add(ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, max)); +// minValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(min)))); +// maxValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(max)))); return this; } From 65a18863462bf11cb16d96b92c0483d64c4f8610 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Tue, 28 Feb 2023 16:09:02 +0800 Subject: [PATCH 03/21] Merge code from #6935. 
--- .../apache/iceberg/parquet/IndexIterator.java | 75 ----- .../iceberg/parquet/PageSkippingHelpers.java | 120 +++++++ .../parquet/ParquetColumnIndexFilter.java | 294 +++++++++++------- .../parquet/TestColumnIndexFilter.java | 20 +- 4 files changed, 318 insertions(+), 191 deletions(-) delete mode 100644 parquet/src/main/java/org/apache/iceberg/parquet/IndexIterator.java create mode 100644 parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/IndexIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/IndexIterator.java deleted file mode 100644 index a9fad04ace62..000000000000 --- a/parquet/src/main/java/org/apache/iceberg/parquet/IndexIterator.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.iceberg.parquet; - -import java.util.NoSuchElementException; -import java.util.PrimitiveIterator; -import java.util.function.IntPredicate; -import java.util.function.IntUnaryOperator; - -/** - * Iterator implementation for page indexes. 
- */ -class IndexIterator implements PrimitiveIterator.OfInt { - private final int endIndex; - private final IntPredicate filter; - private final IntUnaryOperator translator; - private int index; - - private IndexIterator(int startIndex, int endIndex, IntPredicate filter, IntUnaryOperator translator) { - this.endIndex = endIndex; - this.filter = filter; - this.translator = translator; - index = nextPageIndex(startIndex); - } - - static PrimitiveIterator.OfInt all(int pageCount) { - return new IndexIterator(0, pageCount, i -> true, i -> i); - } - - static PrimitiveIterator.OfInt filter(int pageCount, IntPredicate filter) { - return new IndexIterator(0, pageCount, filter, i -> i); - } - - private int nextPageIndex(int startIndex) { - for (int i = startIndex; i < endIndex; ++i) { - if (filter.test(i)) { - return i; - } - } - return -1; - } - - @Override - public boolean hasNext() { - return index >= 0; - } - - @Override - public int nextInt() { - if (hasNext()) { - int ret = index; - index = nextPageIndex(index + 1); - return translator.applyAsInt(ret); - } - throw new NoSuchElementException(); - } -} - diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java b/parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java new file mode 100644 index 000000000000..36f1d5bf804c --- /dev/null +++ b/parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.parquet; + +import java.util.List; +import java.util.PrimitiveIterator; +import java.util.function.IntPredicate; +import org.apache.iceberg.common.DynConstructors; +import org.apache.iceberg.common.DynMethods; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.BlockMetaData; +import org.apache.parquet.internal.column.columnindex.OffsetIndex; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; + +/** + * Helper methods for page skipping. 
+ */ +class PageSkippingHelpers { + private PageSkippingHelpers() { + } + + private static final DynConstructors.Ctor RANGES_LIST_CTOR = DynConstructors.builder() + .hiddenImpl(RowRanges.class, List.class) + .build(); + + private static final RowRanges EMPTY = RANGES_LIST_CTOR.newInstance(ImmutableList.of()); + + static RowRanges empty() { + return EMPTY; + } + + private static final DynMethods.StaticMethod UNION = DynMethods.builder("union") + .hiddenImpl(RowRanges.class, RowRanges.class, RowRanges.class) + .buildStatic(); + + static RowRanges union(RowRanges left, RowRanges right) { + return UNION.invoke(left, right); + } + + private static final DynMethods.StaticMethod INTERSECTION = DynMethods.builder("intersection") + .hiddenImpl(RowRanges.class, RowRanges.class, RowRanges.class) + .buildStatic(); + + static RowRanges intersection(RowRanges left, RowRanges right) { + return INTERSECTION.invoke(left, right); + } + + private static final DynMethods.StaticMethod ROW_RANGES_CREATE = DynMethods.builder("create") + .hiddenImpl(RowRanges.class, long.class, PrimitiveIterator.OfInt.class, OffsetIndex.class) + .buildStatic(); + + static RowRanges createRowRanges(long rowCount, PrimitiveIterator.OfInt pageIndexes, OffsetIndex offsetIndex) { + return ROW_RANGES_CREATE.invoke(rowCount, pageIndexes, offsetIndex); + } + + private static final DynMethods.StaticMethod ROW_RANGES_CREATE_SINGLE = DynMethods.builder("createSingle") + .hiddenImpl(RowRanges.class, long.class) + .buildStatic(); + + static RowRanges allRows(long rowCount) { + return ROW_RANGES_CREATE_SINGLE.invoke(rowCount); + } + + private static final DynMethods.StaticMethod INDEX_ITERATOR_ALL = DynMethods.builder("all") + .hiddenImpl("org.apache.parquet.internal.column.columnindex.IndexIterator", int.class) + .buildStatic(); + + static PrimitiveIterator.OfInt allPageIndexes(int pageCount) { + return INDEX_ITERATOR_ALL.invoke(pageCount); + } + + private static final DynMethods.StaticMethod INDEX_ITERATOR_FILTER = 
DynMethods.builder("filter") + .hiddenImpl("org.apache.parquet.internal.column.columnindex.IndexIterator", int.class, IntPredicate.class) + .buildStatic(); + + static PrimitiveIterator.OfInt filterPageIndexes(int pageCount, IntPredicate filter) { + return INDEX_ITERATOR_FILTER.invoke(pageCount, filter); + } + + private static final DynMethods.UnboundMethod GET_COLUMN_INDEX_STORE = + DynMethods.builder("getColumnIndexStore") + .hiddenImpl("org.apache.parquet.hadoop.ParquetFileReader", int.class) + .build(); + + static ColumnIndexStore getColumnIndexStore(ParquetFileReader reader, int blockIndex) { + return GET_COLUMN_INDEX_STORE.invoke(reader, blockIndex); + } + + private static final DynMethods.UnboundMethod INTERNAL_READ_FILTERED_ROW_GROUP = + DynMethods.builder("internalReadFilteredRowGroup") + .hiddenImpl("org.apache.parquet.hadoop.ParquetFileReader", + BlockMetaData.class, RowRanges.class, ColumnIndexStore.class) + .build(); + + static PageReadStore internalReadFilteredRowGroup(ParquetFileReader reader, int blockIndex, RowRanges rowRanges) { + ColumnIndexStore columnIndexStore = GET_COLUMN_INDEX_STORE.invoke(reader, blockIndex); + BlockMetaData blockMetaData = reader.getRowGroups().get(blockIndex); + return INTERNAL_READ_FILTERED_ROW_GROUP.invoke(reader, blockMetaData, rowRanges, columnIndexStore); + } +} diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java index 38c2f5133f52..6dbc28d5b4e8 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java @@ -19,11 +19,14 @@ package org.apache.iceberg.parquet; +import java.math.BigDecimal; +import java.math.BigInteger; import java.nio.ByteBuffer; -import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; import java.util.Comparator; import java.util.List; import 
java.util.Map; +import java.util.Optional; import java.util.PrimitiveIterator; import java.util.Set; import java.util.function.Function; @@ -37,21 +40,28 @@ import org.apache.iceberg.expressions.Literal; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Comparators; -import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.Types; import org.apache.iceberg.util.BinaryUtil; +import org.apache.iceberg.util.ByteBuffers; import org.apache.iceberg.util.Pair; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.internal.column.columnindex.ColumnIndex; import org.apache.parquet.internal.column.columnindex.OffsetIndex; import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; import org.apache.parquet.internal.filter2.columnindex.RowRanges; -import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.iceberg.parquet.PageSkippingHelpers.allPageIndexes; +import static org.apache.iceberg.parquet.PageSkippingHelpers.allRows; +import static org.apache.iceberg.parquet.PageSkippingHelpers.filterPageIndexes; +import static org.apache.iceberg.parquet.PageSkippingHelpers.intersection; +import static org.apache.iceberg.parquet.PageSkippingHelpers.union; + public class ParquetColumnIndexFilter { private static final Logger LOG = LoggerFactory.getLogger(ParquetColumnIndexFilter.class); @@ -66,58 +76,62 @@ public ParquetColumnIndexFilter(Schema schema, Expression unbound, boolean caseS /** * Calculates the row ranges containing the indexes of the rows might match the expression. 
- * @param typeWithIds schema for the Parquet file with Iceberg type IDs + * @param fileSchema schema of file * @param columnIndexStore the store for providing column/offset indexes * @param rowCount the total number of rows in the row-group * @return the ranges of the possible matching row indexes; the returned ranges will contain all the rows * if any of the required offset index is missing */ - public RowRanges calculateRowRanges(MessageType typeWithIds, ColumnIndexStore columnIndexStore, long rowCount) { + public RowRanges calculateRowRanges(MessageType fileSchema, ColumnIndexStore columnIndexStore, long rowCount) { try { - return new ColumnIndexEvalVisitor(typeWithIds, columnIndexStore, rowCount).eval(); + return new ColumnIndexEvalVisitor(fileSchema, columnIndexStore, rowCount).eval(); } catch (ColumnIndexStore.MissingOffsetIndexException e) { LOG.info("Cannot get required offset index; Unable to filter on this row group", e); - return RowRanges.createSingle(rowCount); + return allRows(rowCount); } } private static final boolean ROWS_MIGHT_MATCH = true; private static final boolean ROWS_CANNOT_MATCH = false; - private static final RowRanges NO_ROWS = RowRanges.EMPTY; + private static final RowRanges NO_ROWS = PageSkippingHelpers.empty(); private class ColumnIndexEvalVisitor extends ExpressionVisitors.BoundExpressionVisitor { private final Map idToColumn = Maps.newHashMap(); - private final Map idToColumnIndex = Maps.newHashMap(); + private final Map idToColumnIndex = Maps.newHashMap(); private final Map idToOffsetIndex = Maps.newHashMap(); - private final Map> conversions = Maps.newHashMap(); + private final Map parquetTypes = Maps.newHashMap(); + private final Map icebergTypes = Maps.newHashMap(); private final RowRanges allRows; private final ColumnIndexStore columnIndexStore; private final long rowCount; - private ColumnIndexEvalVisitor(MessageType typeWithIds, ColumnIndexStore columnIndexStore, long rowCount) { - this.allRows = 
RowRanges.createSingle(rowCount); + private ColumnIndexEvalVisitor(MessageType fileSchema, ColumnIndexStore columnIndexStore, long rowCount) { + this.allRows = allRows(rowCount); this.columnIndexStore = columnIndexStore; this.rowCount = rowCount; - idByColumnPath(typeWithIds.asGroupType(), null, idToColumn); - } - private RowRanges eval() { - return ExpressionVisitors.visit(expr, this); - } + for (ColumnDescriptor desc : fileSchema.getColumns()) { + String[] path = desc.getPath(); + PrimitiveType colType = fileSchema.getType(path).asPrimitiveType(); + if (colType.getId() != null) { + int id = colType.getId().intValue(); + parquetTypes.put(id, colType); + Type type = schema.findType(id); + if (type != null) { + icebergTypes.put(id, type.asPrimitiveType()); + } - private void idByColumnPath(GroupType type, String parent, Map idToColumnPath) { - String prefix = parent == null ? "" : parent + "."; - for (org.apache.parquet.schema.Type field : type.getFields()) { - if (field.isPrimitive()) { - idToColumnPath.put(field.getId().intValue(), ColumnPath.fromDotString(prefix + field.getName())); - } else { - idByColumnPath(field.asGroupType(), prefix, idToColumnPath); + idToColumn.put(id, ColumnPath.get(path)); } } } + private RowRanges eval() { + return ExpressionVisitors.visit(expr, this); + } + @Override public RowRanges alwaysTrue() { return allRows; @@ -137,24 +151,24 @@ public RowRanges not(RowRanges result) { @Override public RowRanges and(RowRanges left, RowRanges right) { - return RowRanges.intersection(left, right); + return intersection(left, right); } @Override public RowRanges or(RowRanges left, RowRanges right) { - return RowRanges.union(left, right); + return union(left, right); } @Override public RowRanges isNull(BoundReference ref) { int id = ref.fieldId(); - Function func = columnIndex -> { + Function func = columnIndex -> { if (columnIndex.hasNullCounts()) { - return IndexIterator.filter(columnIndex.pageCount(), columnIndex::containsNull); + return 
filterPageIndexes(columnIndex.pageCount(), columnIndex::containsNull); } else { // Searching for nulls so if we don't have null related statistics we have to return all pages - return IndexIterator.all(columnIndex.pageCount()); + return allPageIndexes(columnIndex.pageCount()); } }; @@ -172,8 +186,8 @@ public RowRanges notNull(BoundReference ref) { return allRows; } - Function func = - columnIndex -> IndexIterator.filter(columnIndex.pageCount(), columnIndex::isNonNullPage); + Function func = + columnIndex -> filterPageIndexes(columnIndex.pageCount(), columnIndex::isNonNullPage); return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -182,8 +196,8 @@ public RowRanges notNull(BoundReference ref) { public RowRanges isNaN(BoundReference ref) { int id = ref.fieldId(); - Function func = - columnIndex -> IndexIterator.filter(columnIndex.pageCount(), columnIndex::isNonNullPage); + Function func = + columnIndex -> filterPageIndexes(columnIndex.pageCount(), columnIndex::isNonNullPage); return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -198,14 +212,14 @@ public RowRanges notNaN(BoundReference ref) { public RowRanges lt(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { + Function func = columnIndex -> { IntPredicate filter = pageIndex -> { if (columnIndex.isNullPage(pageIndex)) { return ROWS_CANNOT_MATCH; } - T lower = (T) columnIndex.min(pageIndex); + T lower = columnIndex.min(pageIndex); if (lit.comparator().compare(lower, lit.value()) >= 0) { return ROWS_CANNOT_MATCH; } @@ -213,7 +227,7 @@ public RowRanges lt(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return IndexIterator.filter(columnIndex.pageCount(), filter); + return filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -223,14 +237,14 @@ public RowRanges lt(BoundReference ref, Literal lit) { public RowRanges ltEq(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = 
columnIndex -> { + Function func = columnIndex -> { IntPredicate filter = pageIndex -> { if (columnIndex.isNullPage(pageIndex)) { return ROWS_CANNOT_MATCH; } - T lower = (T) columnIndex.min(pageIndex); + T lower = columnIndex.min(pageIndex); if (lit.comparator().compare(lower, lit.value()) > 0) { return ROWS_CANNOT_MATCH; } @@ -238,7 +252,7 @@ public RowRanges ltEq(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return IndexIterator.filter(columnIndex.pageCount(), filter); + return filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -248,21 +262,21 @@ public RowRanges ltEq(BoundReference ref, Literal lit) { public RowRanges gt(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { + Function func = columnIndex -> { IntPredicate filter = pageIndex -> { if (columnIndex.isNullPage(pageIndex)) { return ROWS_CANNOT_MATCH; } - T upper = (T) columnIndex.max(pageIndex); + T upper = columnIndex.max(pageIndex); if (lit.comparator().compare(upper, lit.value()) <= 0) { return ROWS_CANNOT_MATCH; } return ROWS_MIGHT_MATCH; }; - return IndexIterator.filter(columnIndex.pageCount(), filter); + return filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -272,21 +286,21 @@ public RowRanges gt(BoundReference ref, Literal lit) { public RowRanges gtEq(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { + Function func = columnIndex -> { IntPredicate filter = pageIndex -> { if (columnIndex.isNullPage(pageIndex)) { return ROWS_CANNOT_MATCH; } - T upper = (T) columnIndex.max(pageIndex); + T upper = columnIndex.max(pageIndex); if (lit.comparator().compare(upper, lit.value()) < 0) { return ROWS_CANNOT_MATCH; } return ROWS_MIGHT_MATCH; }; - return IndexIterator.filter(columnIndex.pageCount(), filter); + return filterPageIndexes(columnIndex.pageCount(), filter); }; return 
applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -296,19 +310,19 @@ public RowRanges gtEq(BoundReference ref, Literal lit) { public RowRanges eq(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { + Function func = columnIndex -> { IntPredicate filter = pageIndex -> { if (columnIndex.isNullPage(pageIndex)) { return ROWS_CANNOT_MATCH; } - T lower = (T) columnIndex.min(pageIndex); + T lower = columnIndex.min(pageIndex); if (lit.comparator().compare(lower, lit.value()) > 0) { return ROWS_CANNOT_MATCH; } - T upper = (T) columnIndex.max(pageIndex); + T upper = columnIndex.max(pageIndex); if (lit.comparator().compare(upper, lit.value()) < 0) { return ROWS_CANNOT_MATCH; } @@ -316,7 +330,7 @@ public RowRanges eq(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return IndexIterator.filter(columnIndex.pageCount(), filter); + return filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -332,19 +346,19 @@ public RowRanges in(BoundReference ref, Set literalSet) { int id = ref.fieldId(); Pair minMax = minMax(ref.comparator(), literalSet); - Function func = columnIndex -> { + Function func = columnIndex -> { IntPredicate filter = pageIndex -> { if (columnIndex.isNullPage(pageIndex)) { return ROWS_CANNOT_MATCH; } - T lower = (T) columnIndex.min(pageIndex); + T lower = columnIndex.min(pageIndex); if (ref.comparator().compare(lower, minMax.second()) > 0) { return ROWS_CANNOT_MATCH; } - T upper = (T) columnIndex.max(pageIndex); + T upper = columnIndex.max(pageIndex); if (ref.comparator().compare(upper, minMax.first()) < 0) { return ROWS_CANNOT_MATCH; } @@ -352,7 +366,7 @@ public RowRanges in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; }; - return IndexIterator.filter(columnIndex.pageCount(), filter); + return filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -387,7 +401,7 @@ public RowRanges 
notIn(BoundReference ref, Set literalSet) { public RowRanges startsWith(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { + Function func = columnIndex -> { ByteBuffer prefixAsBytes = lit.toByteBuffer(); Comparator comparator = Comparators.unsignedBytes(); @@ -417,7 +431,7 @@ public RowRanges startsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return IndexIterator.filter(columnIndex.pageCount(), filter); + return filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -427,7 +441,7 @@ public RowRanges startsWith(BoundReference ref, Literal lit) { public RowRanges notStartsWith(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { + Function func = columnIndex -> { IntPredicate filter; if (columnIndex.hasNullCounts()) { ByteBuffer prefixAsBytes = lit.toByteBuffer(); @@ -470,30 +484,32 @@ public RowRanges notStartsWith(BoundReference ref, Literal lit) { filter = pageIndex -> ROWS_MIGHT_MATCH; } - return IndexIterator.filter(columnIndex.pageCount(), filter); + return filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_MIGHT_MATCH); } private RowRanges applyPredicate(int columnId, - Function func, + Function func, boolean missingColumnMightMatch) { if (!idToColumn.containsKey(columnId)) { return missingColumnMightMatch ? allRows : NO_ROWS; } - // Get the offset index first so that the MissingOffsetIndexException (if any) is thrown ASAP + // If the column index of a column is not available, we cannot filter on this column. + // If the offset index of a column is not available, we cannot filter on this row group. + // Get the offset index first so that the MissingOffsetIndexException (if any) is thrown ASAP. 
OffsetIndex offsetIndex = offsetIndex(columnId); - ColumnIndexWrapper columnIndex = columnIndex(columnId); + ParquetColumnIndex columnIndex = columnIndex(columnId); if (columnIndex == null) { LOG.info("No column index for column {} is available; Unable to filter on this column", idToColumn.get(columnId)); return allRows; } - return RowRanges.create(rowCount, func.apply(columnIndex), offsetIndex); + return PageSkippingHelpers.createRowRanges(rowCount, func.apply(columnIndex), offsetIndex); } // Assumes that the column corresponding to the id exists in the file. @@ -502,69 +518,38 @@ private OffsetIndex offsetIndex(int columnId) { } // Assumes that the column corresponding to the id exists in the file. - private ColumnIndexWrapper columnIndex(int columnId) { - ColumnIndexWrapper wrapper = idToColumnIndex.get(columnId); + private ParquetColumnIndex columnIndex(int columnId) { + ParquetColumnIndex wrapper = idToColumnIndex.get(columnId); if (wrapper == null) { ColumnIndex columnIndex = columnIndexStore.getColumnIndex(idToColumn.get(columnId)); if (columnIndex != null) { - wrapper = new ColumnIndexWrapper(columnIndex, conversion(columnId)); + wrapper = new ParquetColumnIndex(columnIndex, parquetTypes.get(columnId), icebergTypes.get(columnId)); idToColumnIndex.put(columnId, wrapper); } } return wrapper; } - - // Assumes that the field corresponding to the id exists in the Iceberg schema. - private Function conversion(int columnId) { - Function conversion = conversions.get(columnId); - - if (conversion == null) { - Type type = schema.findType(columnId); - conversion = buffer -> { - // The buffers returned by Parquet might be in little-endian byte order, - // but Conversions#fromByteBuffer use big-endian byte order for UUIDs and Decimals. 
- if ((type == Types.UUIDType.get() || type instanceof Types.DecimalType) && - buffer.order() == ByteOrder.LITTLE_ENDIAN) { - return Conversions.fromByteBuffer(type, toBigEndian(buffer)); - } else { - return Conversions.fromByteBuffer(type, buffer); - } - }; - - conversions.put(columnId, conversion); - } - - return conversion; - } - } - - private static ByteBuffer toBigEndian(ByteBuffer buffer) { - int size = buffer.remaining(); - ByteBuffer bigEndian = ByteBuffer.allocate(size).order(ByteOrder.BIG_ENDIAN); - for (int i = 0; i < size; i += 1) { - bigEndian.put(i, buffer.get(size - 1 - i)); - } - - return bigEndian; } /** * A wrapper for ColumnIndex, which will cache statistics data and convert min max buffers to Iceberg type values. */ - private static class ColumnIndexWrapper { + private static class ParquetColumnIndex { private final ColumnIndex columnIndex; - private final Function conversion; + private final PrimitiveType primitiveType; + private final Type.PrimitiveType icebergType; private List nullPages; private List minBuffers; private List maxBuffers; private List nullCounts; // optional field - private ColumnIndexWrapper(ColumnIndex columnIndex, Function conversion) { + private ParquetColumnIndex(ColumnIndex columnIndex, PrimitiveType primitiveType, Type.PrimitiveType icebergType) { this.columnIndex = columnIndex; - this.conversion = conversion; + this.primitiveType = primitiveType; + this.icebergType = icebergType; } private ByteBuffer minBuffer(int pageIndex) { @@ -591,12 +576,12 @@ private List nullPages() { return nullPages; } - private Object min(int pageIndex) { - return conversion.apply(minBuffer(pageIndex)); + private T min(int pageIndex) { + return fromBytes(minBuffer(pageIndex), primitiveType, icebergType); } - private Object max(int pageIndex) { - return conversion.apply(maxBuffer(pageIndex)); + private T max(int pageIndex) { + return fromBytes(maxBuffer(pageIndex), primitiveType, icebergType); } private Boolean isNullPage(int pageIndex) { @@ 
-626,5 +611,102 @@ private boolean containsNull(int pageIndex) { private int pageCount() { return nullPages().size(); } + + @SuppressWarnings("unchecked") + private T fromBytes(ByteBuffer bytes, PrimitiveType primitiveType, Type.PrimitiveType icebergType) { + LogicalTypeAnnotation logicalTypeAnnotation = primitiveType.getLogicalTypeAnnotation(); + Optional converted = logicalTypeAnnotation == null ? Optional.empty() : logicalTypeAnnotation + .accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return Optional.of(StandardCharsets.UTF_8.decode(bytes)); + } + + @Override + public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return Optional.of(StandardCharsets.UTF_8.decode(bytes)); + } + + @Override + public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalType) { + switch (primitiveType.getPrimitiveTypeName()) { + case INT32: + return Optional.of(new BigDecimal(BigInteger.valueOf(bytes.getInt(0)), decimalType.getScale())); + case INT64: + return Optional.of(new BigDecimal(BigInteger.valueOf(bytes.getLong(0)), decimalType.getScale())); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + new BigDecimal(new BigInteger(ByteBuffers.toByteArray(bytes)), decimalType.getScale()); + } + return Optional.empty(); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + switch (timeLogicalType.getUnit()) { + case MILLIS: + return Optional.of(((long) bytes.getInt(0)) * 1000L); + case MICROS: + return Optional.of(bytes.getLong(0)); + case NANOS: + return Optional.of(Math.floorDiv(bytes.getLong(0), 1000)); + } + return Optional.empty(); + } + + @Override + public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + switch (timestampLogicalType.getUnit()) { + case MILLIS: + return Optional.of(bytes.getLong(0) * 
1000); + case MICROS: + return Optional.of(bytes.getLong(0)); + case NANOS: + return Optional.of(Math.floorDiv(bytes.getLong(0), 1000)); + } + return Optional.empty(); + } + + @Override + public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + return Optional.of(StandardCharsets.UTF_8.decode(bytes)); + } + + @Override + public Optional visit(LogicalTypeAnnotation.UUIDLogicalTypeAnnotation uuidLogicalType) { + return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(uuidLogicalType); + } + }); + + if (converted.isPresent()) { + return (T) converted.get(); + } + + switch (primitiveType.getPrimitiveTypeName()) { + case BOOLEAN: + return (T) (Boolean) (bytes.get() != 0); + case INT32: + Integer intValue = bytes.getInt(0); + if (icebergType.typeId() == Type.TypeID.LONG) { + return (T) (Long) intValue.longValue(); + } + return (T) intValue; + case INT64: + return (T) (Long) bytes.getLong(0); + case FLOAT: + Float floatValue = bytes.getFloat(0); + if (icebergType.typeId() == Type.TypeID.DOUBLE) { + return (T) (Double) floatValue.doubleValue(); + } + return (T) floatValue; + case DOUBLE: + return (T) (Double) bytes.getDouble(0); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return (T) bytes; + default: + throw new UnsupportedOperationException("Unsupported Parquet type: " + primitiveType); + } + } } } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java index 70f7cde1d33e..c61b6f49e8bd 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -33,6 +33,7 @@ import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.DecimalUtil; +import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.hadoop.metadata.ColumnPath; import 
org.apache.parquet.internal.column.columnindex.BoundaryOrder; import org.apache.parquet.internal.column.columnindex.ColumnIndex; @@ -286,15 +287,15 @@ public OffsetIndex getOffsetIndex(ColumnPath column) { /** END **/ private static final MessageType FILE_SCHEMA = ParquetSchemaUtil.convert(SCHEMA_MISSING_COLUMN, "table"); - private static final RowRanges ALL_ROWS = RowRanges.createSingle(TOTAL_ROW_COUNT); - private static final RowRanges NO_ROWS = RowRanges.EMPTY; + private static final RowRanges ALL_ROWS = PageSkippingHelpers.allRows(TOTAL_ROW_COUNT); + private static final RowRanges NO_ROWS = PageSkippingHelpers.empty(); private static RowRanges selectRowRanges(String path, int... pageIndexes) { return selectRowRanges(path, STORE, TOTAL_ROW_COUNT, pageIndexes); } private static RowRanges selectRowRanges(String path, ColumnIndexStore store, long rowCount, int... pageIndexes) { - return RowRanges.create(rowCount, new PrimitiveIterator.OfInt() { + return PageSkippingHelpers.createRowRanges(rowCount, new PrimitiveIterator.OfInt() { int index = -1; @Override @@ -465,6 +466,9 @@ public void testAnd() { RowRanges expected; Expression expr; + List columns = FILE_SCHEMA.getColumns(); + columns.forEach(System.out::println); + expected = NO_ROWS; expr = Expressions.and(equal(INT_COL, 1), equal(INT_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); @@ -473,7 +477,7 @@ public void testAnd() { assertRowRangesEquals(expected, calculateRowRanges(expr)); expr = Expressions.and(equal(INT_COL, 2), equal(STR_COL, "Tango")); - expected = RowRanges.intersection(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); + expected = PageSkippingHelpers.intersection(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -486,12 +490,12 @@ public void testOr() { expr = Expressions.or(equal(INT_COL, 1), equal(INT_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); - expected = 
RowRanges.union(selectRowRanges(INT_COL, 0), selectRowRanges(STR_COL, 7)); + expected = PageSkippingHelpers.union(selectRowRanges(INT_COL, 0), selectRowRanges(STR_COL, 7)); expr = Expressions.or(equal(INT_COL, 1), equal(STR_COL, "Alfa")); assertRowRangesEquals(expected, calculateRowRanges(expr)); expr = Expressions.or(equal(INT_COL, 2), equal(STR_COL, "Tango")); - expected = RowRanges.union(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); + expected = PageSkippingHelpers.union(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -817,8 +821,6 @@ CIBuilder addPage(long nullCount, int min, int max) { nullCounts.add(nullCount); minValues.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, min)); maxValues.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, max)); -// minValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(min))); -// maxValues.add(ByteBuffer.wrap(BytesUtils.intToBytes(max))); return this; } @@ -848,8 +850,6 @@ CIBuilder addPage(long nullCount, double min, double max) { nullCounts.add(nullCount); minValues.add(ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, min)); maxValues.add(ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, max)); -// minValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(min)))); -// maxValues.add(ByteBuffer.wrap(BytesUtils.longToBytes(Double.doubleToLongBits(max)))); return this; } From f6f84653a063bdcc3ed67c42f8ad9b462f6093b1 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Tue, 28 Feb 2023 20:47:55 +0800 Subject: [PATCH 04/21] Refactor tests. 
--- .../parquet/ParquetColumnIndexFilter.java | 2 +- .../parquet/TestColumnIndexFilter.java | 484 ++++++++++-------- 2 files changed, 258 insertions(+), 228 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java index 6dbc28d5b4e8..6f0ad1553694 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java @@ -636,7 +636,7 @@ public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation return Optional.of(new BigDecimal(BigInteger.valueOf(bytes.getLong(0)), decimalType.getScale())); case BINARY: case FIXED_LEN_BYTE_ARRAY: - new BigDecimal(new BigInteger(ByteBuffers.toByteArray(bytes)), decimalType.getScale()); + return Optional.of(new BigDecimal(new BigInteger(ByteBuffers.toByteArray(bytes)), decimalType.getScale())); } return Optional.empty(); } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java index c61b6f49e8bd..7b3c3969b5cd 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -42,6 +42,7 @@ import org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder; import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; import org.apache.parquet.internal.filter2.columnindex.RowRanges; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; @@ -49,18 +50,22 @@ import org.junit.Test; import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.iceberg.expressions.Expressions.and; import static 
org.apache.iceberg.expressions.Expressions.equal; import static org.apache.iceberg.expressions.Expressions.greaterThan; import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; import static org.apache.iceberg.expressions.Expressions.isNaN; import static org.apache.iceberg.expressions.Expressions.isNull; import static org.apache.iceberg.expressions.Expressions.lessThan; import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; import static org.apache.iceberg.expressions.Expressions.not; import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; import static org.apache.iceberg.expressions.Expressions.notStartsWith; +import static org.apache.iceberg.expressions.Expressions.or; import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.ASCENDING; import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCENDING; @@ -77,14 +82,120 @@ public class TestColumnIndexFilter { /** * COPIED FROM org.apache.parquet.internal.filter2.columnindex.TestColumnIndexFilter **/ + private static class CIBuilder { + private static final ByteBuffer EMPTY = ByteBuffer.wrap(new byte[0]); + private final PrimitiveType type; + private final BoundaryOrder order; + boolean invalid = false; + private List nullPages = Lists.newArrayList(); + private List nullCounts = Lists.newArrayList(); + private List minValues = Lists.newArrayList(); + private List maxValues = Lists.newArrayList(); + + CIBuilder(PrimitiveType type, BoundaryOrder order) { + this.type = type; + this.order = order; + } + + CIBuilder addNullPage(long nullCount) { + nullPages.add(true); + nullCounts.add(nullCount); + 
minValues.add(EMPTY); + maxValues.add(EMPTY); + return this; + } + + CIBuilder addPage(long nullCount, byte[] min, byte[] max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(min)); + maxValues.add(ByteBuffer.wrap(max)); + return this; + } + + CIBuilder addPage(long nullCount, int min, int max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, min)); + maxValues.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, max)); + return this; + } + + CIBuilder addPage(long nullCount, long min, long max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.allocate(Long.BYTES).order(ByteOrder.LITTLE_ENDIAN).putLong(0, min)); + maxValues.add(ByteBuffer.allocate(Long.BYTES).order(ByteOrder.LITTLE_ENDIAN).putLong(0, max)); + return this; + } + + CIBuilder addPage(long nullCount, String min, String max) { + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.wrap(min.getBytes(UTF_8))); + maxValues.add(ByteBuffer.wrap(max.getBytes(UTF_8))); + return this; + } + + CIBuilder addPage(long nullCount, double min, double max) { + if (Double.isNaN(min) || Double.isNaN(max)) { + invalid = true; + return this; + } + + nullPages.add(false); + nullCounts.add(nullCount); + minValues.add(ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, min)); + maxValues.add(ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, max)); + return this; + } + + ColumnIndex build() { + return invalid ? 
null : ColumnIndexBuilder.build(type, order, nullPages, nullCounts, minValues, maxValues); + } + } + + private static class OIBuilder { + private final OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); + + OIBuilder addPage(long rowCount) { + builder.add(1234, rowCount); + return this; + } + + OffsetIndex build() { + return builder.build(); + } + } + private static final long TOTAL_ROW_COUNT = 30; private static final String INT_COL = "int_col"; private static final String STR_COL = "str_col"; private static final String NO_NANS = "no_nans"; private static final String NO_CI = "no_ci"; private static final String ALL_NULLS = "all_nulls"; - private static final String ALL_NANS = "all_nans"; + private static final String INT_DECIMAL_7_2 = "int_decimal_7_2"; private static final String NOT_IN_FILE = "not_in_file"; + + private static final Schema SCHEMA = new Schema( + Types.NestedField.optional(1, INT_COL, Types.IntegerType.get()), + Types.NestedField.optional(2, STR_COL, Types.StringType.get()), + Types.NestedField.optional(3, NO_NANS, Types.DoubleType.get()), + Types.NestedField.optional(4, NO_CI, Types.DoubleType.get()), + Types.NestedField.optional(5, ALL_NULLS, Types.LongType.get()), + Types.NestedField.optional(6, INT_DECIMAL_7_2, Types.DecimalType.of(7, 2)), + Types.NestedField.optional(7, NOT_IN_FILE, Types.LongType.get()) + ); + + private static final MessageType FILE_SCHEMA = org.apache.parquet.schema.Types.buildMessage() + .addField(optional(INT32).id(1).named(INT_COL)) + .addField(optional(BINARY).id(2).as(LogicalTypeAnnotation.stringType()).id(2).named(STR_COL)) + .addField(optional(DOUBLE).id(3).named(NO_NANS)) + .addField(optional(DOUBLE).id(4).named(NO_CI)) + .addField(optional(INT64).id(5).named(ALL_NULLS)) + .addField(optional(INT32).id(6).as(LogicalTypeAnnotation.decimalType(2, 9)).named(INT_DECIMAL_7_2)) + .named("table"); + private static final ColumnIndex INT_COL_CI = new CIBuilder(optional(INT32).named(INT_COL), ASCENDING) 
.addPage(0, 1, 1) .addPage(1, 2, 6) @@ -162,9 +273,27 @@ public class TestColumnIndexFilter { .addPage(1) .addPage(29) .build(); - private static final ColumnIndex ALL_NANS_CI = new CIBuilder(optional(DOUBLE).named(ALL_NANS), UNORDERED) - .addPage(1, Double.NaN, Double.NaN) - .addPage(29, Double.NaN, Double.NaN) + private static final ColumnIndex INT_DECIMAL_7_2_CI = new CIBuilder(optional(INT32).named(INT_DECIMAL_7_2), UNORDERED) + .addPage(0, 99, 99) + .addPage(0, 100, 100) + .addPage(0, 101, 101) + .addPage(0, 98, 98) + .addPage(0, 99, 103) + .addNullPage(4) + .addPage(0, 100, 100) + .addPage(2, 87, 109) + .addNullPage(2) + .build(); + private static final OffsetIndex INT_DECIMAL_7_2_OI = new OIBuilder() + .addPage(1) + .addPage(3) + .addPage(2) + .addPage(1) + .addPage(5) + .addPage(4) + .addPage(5) + .addPage(7) + .addPage(2) .build(); private static final ColumnIndexStore STORE = new ColumnIndexStore() { @Override @@ -180,8 +309,8 @@ public ColumnIndex getColumnIndex(ColumnPath column) { return NO_CI_CI; case ALL_NULLS: return ALL_NULLS_CI; - case ALL_NANS: - return ALL_NANS_CI; + case INT_DECIMAL_7_2: + return INT_DECIMAL_7_2_CI; default: return null; } @@ -200,93 +329,69 @@ public OffsetIndex getOffsetIndex(ColumnPath column) { return NO_CI_OI; case ALL_NULLS: return ALL_NULLS_OI; - case ALL_NANS: - return ALL_NANS_OI; + case INT_DECIMAL_7_2: + return INT_DECIMAL_7_2_OI; default: throw new MissingOffsetIndexException(column); } } }; - private static final OffsetIndex ALL_NANS_OI = new OIBuilder() - .addPage(1) - .addPage(29) - .build(); + /** *
-   * row   int_col       str_col        no_nans        no_ci          all_nulls      all_nans
+   * row   int_col       str_col        no_nans        no_ci          all_nulls      int_decimal_7_2
    *                                                 (no column index)
    *      ------0------  ------0------  ------0------  ------0------  ------0------  ------0------
-   * 0.   1              Zulu           2.03                          null           NaN
+   * 0.   1              Zulu           2.03                          null           99
    *      ------1------  ------1------  ------1------  ------1------  ------1------  ------1------
-   * 1.   2              Yankee         4.67                          null           NaN
-   * 2.   3              Xray           3.42                          null           NaN
-   * 3.   4              Whiskey        8.71                          null           NaN
-   *                     ------2------                 ------2------
-   * 4.   5              Victor         0.56                          null           NaN
-   * 5.   6              Uniform        4.30                          null           NaN
-   *                                    ------2------  ------3------
-   * 6.   null           null           null                          null           NaN
-   *      ------2------                                ------4------
-   * 7.   7              Tango          3.50                          null           NaN
+   * 1.   2              Yankee         4.67                          null           100
+   * 2.   3              Xray           3.42                          null           100
+   * 3.   4              Whiskey        8.71                          null           100
+   *                     ------2------                 ------2------                 ------2------
+   * 4.   5              Victor         0.56                          null           101
+   * 5.   6              Uniform        4.30                          null           101
+   *                                    ------2------  ------3------                 ------3------
+   * 6.   null           null           null                          null           98
+   *      ------2------                                ------4------                 ------4------
+   * 7.   7              Tango          3.50                          null           102
    *                     ------3------
-   * 8.   7              null           3.14                          null           NaN
+   * 8.   7              null           3.14                          null           103
    *      ------3------k
-   * 9.   7              null           null                          null           NaN
+   * 9.   7              null           null                          null           99
    *                                    ------3------
-   * 10.  null           null           9.99                          null           NaN
+   * 10.  null           null           9.99                          null           100
    *                     ------4------
-   * 11.  8              Sierra         8.78                          null           NaN
-   *                                                   ------5------
-   * 12.  9              Romeo          9.56                          null           NaN
-   * 13.  10             Quebec         2.71                          null           NaN
+   * 11.  8              Sierra         8.78                          null           99
+   *                                                   ------5------                 ------5------
+   * 12.  9              Romeo          9.56                          null           null
+   * 13.  10             Quebec         2.71                          null           null
    *      ------4------
-   * 14.  11             Papa           5.71                          null           NaN
-   * 15.  12             Oscar          4.09                          null           NaN
-   *                     ------5------  ------4------  ------6------
-   * 16.  13             November       null                          null           NaN
-   * 17.  14             Mike           null                          null           NaN
-   * 18.  15             Lima           0.36                          null           NaN
-   * 19.  16             Kilo           2.94                          null           NaN
-   * 20.  17             Juliett        4.23                          null           NaN
-   *      ------5------  ------6------                 ------7------
-   * 21.  18             India          null                          null           NaN
-   * 22.  19             Hotel          5.32                          null           NaN
+   * 14.  11             Papa           5.71                          null           null
+   * 15.  12             Oscar          4.09                          null           null
+   *                     ------5------  ------4------  ------6------                 ------6------
+   * 16.  13             November       null                          null           100
+   * 17.  14             Mike           null                          null           100
+   * 18.  15             Lima           0.36                          null           100
+   * 19.  16             Kilo           2.94                          null           100
+   * 20.  17             Juliett        4.23                          null           100
+   *      ------5------  ------6------                 ------7------                 ------7------
+   * 21.  18             India          null                          null           109
+   * 22.  19             Hotel          5.32                          null           108
    *                                    ------5------
-   * 23.  20             Golf           4.17                          null           NaN
-   * 24.  21             Foxtrot        7.92                          null           NaN
-   * 25.  22             Echo           7.95                          null           NaN
+   * 23.  20             Golf           4.17                          null           88
+   * 24.  21             Foxtrot        7.92                          null           87
+   * 25.  22             Echo           7.95                          null           88
    *                                   ------6------
-   * 26.  23             Delta          null                          null           NaN
+   * 26.  23             Delta          null                          null           88
    *      ------6------
-   * 27.  24             Charlie        null                          null           NaN
-   *                                                   ------8------
-   * 28.  25             Bravo          null                          null           NaN
+   * 27.  24             Charlie        null                          null           88
+   *                                                   ------8------                 ------8------
+   * 28.  25             Bravo          null                          null           null
    *                     ------7------
-   * 29.  26             Alfa           null                          null           NaN
+   * 29.  26             Alfa           null                          null           null
    * 
*/ - private static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, INT_COL, Types.IntegerType.get()), - Types.NestedField.optional(2, STR_COL, Types.StringType.get()), - Types.NestedField.optional(3, NO_NANS, Types.DoubleType.get()), - Types.NestedField.optional(4, NO_CI, Types.IntegerType.get()), - Types.NestedField.optional(5, ALL_NULLS, Types.LongType.get()), - Types.NestedField.optional(6, ALL_NANS, Types.DoubleType.get()), - Types.NestedField.optional(7, NOT_IN_FILE, Types.LongType.get()) - ); - private static final Schema SCHEMA_MISSING_COLUMN = new Schema( - Types.NestedField.optional(1, INT_COL, Types.IntegerType.get()), - Types.NestedField.optional(2, STR_COL, Types.StringType.get()), - Types.NestedField.optional(3, NO_NANS, Types.DoubleType.get()), - Types.NestedField.optional(4, NO_CI, Types.IntegerType.get()), - Types.NestedField.optional(5, ALL_NULLS, Types.LongType.get()), - Types.NestedField.optional(6, ALL_NANS, Types.DoubleType.get()) - ); - - /** END **/ - - private static final MessageType FILE_SCHEMA = ParquetSchemaUtil.convert(SCHEMA_MISSING_COLUMN, "table"); private static final RowRanges ALL_ROWS = PageSkippingHelpers.allRows(TOTAL_ROW_COUNT); private static final RowRanges NO_ROWS = PageSkippingHelpers.empty(); @@ -346,16 +451,11 @@ private void assertRowRangesEquals(RowRanges expected, RowRanges actual) { } private RowRanges calculateRowRanges(Expression expr) { - return calculateRowRanges(SCHEMA, expr, true); + return calculateRowRanges(expr, true); } private RowRanges calculateRowRanges(Expression expr, boolean caseSensitive) { - return calculateRowRanges(SCHEMA, expr, caseSensitive); - } - - private RowRanges calculateRowRanges(Schema schema, Expression expr, boolean caseSensitive) { - return new ParquetColumnIndexFilter(schema, expr, caseSensitive) - .calculateRowRanges(FILE_SCHEMA, STORE, TOTAL_ROW_COUNT); + return calculateRowRanges(SCHEMA, FILE_SCHEMA, expr, caseSensitive); } private RowRanges 
calculateRowRanges(Schema schema, MessageType messageType, Expression expr, boolean caseSensitive) { @@ -410,7 +510,7 @@ public void testIsNaN() { expected = selectRowRanges(NO_NANS, 0, 1, 2, 3, 4, 5); assertRowRangesEquals(expected, calculateRowRanges(isNaN(NO_NANS))); - assertRowRangesEquals(ALL_ROWS, calculateRowRanges(isNaN(ALL_NANS))); + assertRowRangesEquals(ALL_ROWS, calculateRowRanges(isNaN(NO_CI))); } @Test @@ -420,7 +520,7 @@ public void testNotNaN() { expected = ALL_ROWS; assertRowRangesEquals(expected, calculateRowRanges(notNaN(NO_NANS))); - assertRowRangesEquals(expected, calculateRowRanges(notNaN(ALL_NANS))); + assertRowRangesEquals(expected, calculateRowRanges(notNaN(NO_CI))); } @Test @@ -470,13 +570,13 @@ public void testAnd() { columns.forEach(System.out::println); expected = NO_ROWS; - expr = Expressions.and(equal(INT_COL, 1), equal(INT_COL, 2)); + expr = and(equal(INT_COL, 1), equal(INT_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); - expr = Expressions.and(equal(INT_COL, 1), equal(STR_COL, "Alfa")); + expr = and(equal(INT_COL, 1), equal(STR_COL, "Alfa")); assertRowRangesEquals(expected, calculateRowRanges(expr)); - expr = Expressions.and(equal(INT_COL, 2), equal(STR_COL, "Tango")); + expr = and(equal(INT_COL, 2), equal(STR_COL, "Tango")); expected = PageSkippingHelpers.intersection(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -487,14 +587,14 @@ public void testOr() { Expression expr; expected = selectRowRanges(INT_COL, 0, 1); - expr = Expressions.or(equal(INT_COL, 1), equal(INT_COL, 2)); + expr = or(equal(INT_COL, 1), equal(INT_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); expected = PageSkippingHelpers.union(selectRowRanges(INT_COL, 0), selectRowRanges(STR_COL, 7)); - expr = Expressions.or(equal(INT_COL, 1), equal(STR_COL, "Alfa")); + expr = or(equal(INT_COL, 1), equal(STR_COL, "Alfa")); assertRowRangesEquals(expected, 
calculateRowRanges(expr)); - expr = Expressions.or(equal(INT_COL, 2), equal(STR_COL, "Tango")); + expr = or(equal(INT_COL, 2), equal(STR_COL, "Tango")); expected = PageSkippingHelpers.union(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -533,7 +633,6 @@ public void testIntegerLtEq() { assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 11))); expected = selectRowRanges(INT_COL, 0); - assertRowRangesEquals(expected, calculateRowRanges(lessThanOrEqual(INT_COL, 1))); } @@ -612,6 +711,15 @@ public void testStringStartsWith() { expected = NO_ROWS; assertRowRangesEquals(expected, calculateRowRanges(startsWith(STR_COL, "?"))); + assertRowRangesEquals(expected, calculateRowRanges(startsWith(STR_COL, "s"))); + + expected = selectRowRanges(STR_COL, 4); + assertRowRangesEquals(expected, calculateRowRanges(startsWith(STR_COL, "S"))); + + expected = selectRowRanges(STR_COL, 4, 6); + assertRowRangesEquals(expected, calculateRowRanges( + Expressions.or(startsWith(STR_COL, "Q"), startsWith(STR_COL, "G")))); + expected = selectRowRanges(STR_COL, 0); assertRowRangesEquals(expected, calculateRowRanges(startsWith(STR_COL, "Z"))); } @@ -625,6 +733,9 @@ public void testStringNotStartsWith() { expected = selectRowRanges(STR_COL, 0, 1, 2, 3, 4, 5, 6); assertRowRangesEquals(expected, calculateRowRanges(notStartsWith(STR_COL, "A"))); + + expected = ALL_ROWS; + assertRowRangesEquals(expected, calculateRowRanges(notStartsWith(STR_COL, "B"))); } @Test @@ -632,7 +743,7 @@ public void testIntegerIn() { RowRanges expected; Expression expr; - expr = Expressions.in(INT_COL, 7, 13); + expr = in(INT_COL, 7, 13); expected = selectRowRanges(INT_COL, 2, 3, 4); assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -642,7 +753,7 @@ public void testIntegerNotIn() { RowRanges expected; Expression expr; - expr = Expressions.notIn(INT_COL, 7, 13); + expr = notIn(INT_COL, 7, 13); expected = ALL_ROWS; 
assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -652,21 +763,25 @@ public void testSomeNullsNotEq() { RowRanges expected; Expression expr; - expr = Expressions.notEqual(STR_COL, "equal"); + expr = notEqual(STR_COL, "equal"); expected = ALL_ROWS; assertRowRangesEquals(expected, calculateRowRanges(expr)); } @Test - public void testTypePromotion() { + public void testIntTypePromotion() { RowRanges expected; Schema promotedLong = new Schema(Types.NestedField.optional(1, INT_COL, Types.LongType.get())); expected = NO_ROWS; - assertRowRangesEquals(expected, calculateRowRanges(promotedLong, equal(INT_COL, 0), true)); + RowRanges actual = + calculateRowRanges(promotedLong, FILE_SCHEMA, equal(INT_COL, 0), true, STORE, TOTAL_ROW_COUNT); + assertRowRangesEquals(expected, actual); expected = selectRowRanges(INT_COL, 2, 3); - assertRowRangesEquals(expected, calculateRowRanges(promotedLong, equal(INT_COL, 7), true)); + actual = + calculateRowRanges(promotedLong, FILE_SCHEMA, equal(INT_COL, 7), true, STORE, TOTAL_ROW_COUNT); + assertRowRangesEquals(expected, actual); } @Test @@ -676,10 +791,45 @@ public void testMissingOffsetIndex() { PrimitiveType missingOI = org.apache.parquet.schema.Types.primitive(INT32, Type.Repetition.REQUIRED) .id(1) .named("missing_oi"); - MessageType messageType = new MessageType("test", missingOI); + MessageType messageType = new MessageType("table", missingOI); expected = ALL_ROWS; - assertRowRangesEquals(expected, calculateRowRanges(SCHEMA, messageType, equal(INT_COL, 1), true)); + RowRanges actual = calculateRowRanges(SCHEMA, messageType, equal(INT_COL, 1), true, STORE, TOTAL_ROW_COUNT); + assertRowRangesEquals(expected, actual); + } + + @Test + public void testIntBackedDecimal() { + RowRanges expected; + + Expression expr = equal(INT_DECIMAL_7_2, new BigDecimal("1.00")); + expected = selectRowRanges(INT_DECIMAL_7_2, 1, 4, 6, 7); + + assertRowRangesEquals(expected, calculateRowRanges(expr)); + + expr = or(lessThan(INT_DECIMAL_7_2, new 
BigDecimal("1.00")), + greaterThan(INT_DECIMAL_7_2, new BigDecimal("1.01"))); + + expected = selectRowRanges(INT_DECIMAL_7_2, 0, 3, 4, 7); + assertRowRangesEquals(expected, calculateRowRanges(expr)); + } + + @Test + public void testDecimalTypePromotion() { + RowRanges expected; + + Schema promotedDecimal = new Schema(Types.NestedField.optional(6, INT_DECIMAL_7_2, Types.DecimalType.of(38, + 10))); + + Expression expr = equal(INT_DECIMAL_7_2, new BigDecimal("1.00")); + expected = selectRowRanges(INT_DECIMAL_7_2, 1, 4, 6, 7); + assertRowRangesEquals(expected, calculateRowRanges(expr)); + + expr = or(lessThan(INT_DECIMAL_7_2, new BigDecimal("1.00")), + greaterThan(INT_DECIMAL_7_2, new BigDecimal("1.01"))); + + expected = selectRowRanges(INT_DECIMAL_7_2, 0, 3, 4, 7); + assertRowRangesEquals(expected, calculateRowRanges(expr)); } // 38 precision 10 scale decimal to bytes @@ -691,28 +841,15 @@ private byte[] decimalToBytes(String decimalStr) { } @Test - public void testDecimalType() { - String intDecimal = "decimal_7_2"; - String longDecimal = "decimal_11_2"; + public void testBinaryBackedDecimal() { String binaryDecimal = "decimal_38_10"; long rowCount = 9; - ColumnIndex intDecimalCI = new CIBuilder(optional(INT32).named(intDecimal), ASCENDING) - .addPage(0, 1234, 1235) - .addPage(1, 1235, 1235) - .addPage(2, 1237, 9999) - .build(); - - OffsetIndex intDecimalOI = new OIBuilder() - .addPage(2) - .addPage(3) - .addPage(4) - .build(); ColumnIndex binaryDecimalCI = new CIBuilder(optional(FIXED_LEN_BYTE_ARRAY) .length(TypeUtil.decimalRequiredBytes(38)) .named(binaryDecimal), ASCENDING) .addPage(0, decimalToBytes("12.34"), decimalToBytes("12.35")) - .addPage(0, decimalToBytes("12.35"), decimalToBytes("12.39")) + .addPage(0, decimalToBytes("123456789.87654321"), decimalToBytes("123456789.87654323")) .build(); OffsetIndex binaryDecimalOI = new OIBuilder() @@ -724,8 +861,6 @@ public void testDecimalType() { @Override public ColumnIndex getColumnIndex(ColumnPath columnPath) { 
switch (columnPath.toDotString()) { - case "decimal_7_2": - return intDecimalCI; case "decimal_38_10": return binaryDecimalCI; default: @@ -736,8 +871,6 @@ public ColumnIndex getColumnIndex(ColumnPath columnPath) { @Override public OffsetIndex getOffsetIndex(ColumnPath columnPath) { switch (columnPath.toDotString()) { - case "decimal_7_2": - return intDecimalOI; case "decimal_38_10": return binaryDecimalOI; default: @@ -746,128 +879,25 @@ public OffsetIndex getOffsetIndex(ColumnPath columnPath) { } }; - MessageType messageType = new MessageType("test", - org.apache.parquet.schema.Types.primitive(INT32, Type.Repetition.OPTIONAL) - .id(1) - .named("decimal_7_2"), - org.apache.parquet.schema.Types.primitive(FIXED_LEN_BYTE_ARRAY, Type.Repetition.OPTIONAL) - .length(TypeUtil.decimalRequiredBytes(38)) - .id(3) - .named("decimal_38_10")); + MessageType messageType = org.apache.parquet.schema.Types.buildMessage() + .addField(optional(FIXED_LEN_BYTE_ARRAY).length(TypeUtil.decimalRequiredBytes(38)).id(1).as(LogicalTypeAnnotation.decimalType(10, 38)).named(binaryDecimal)) + .named("decimal"); - Schema schema = new Schema(Types.NestedField.optional(1, intDecimal, Types.DecimalType.of(7, 2)), - Types.NestedField.optional(2, longDecimal, Types.DecimalType.of(11, 2)), - Types.NestedField.optional(3, binaryDecimal, Types.DecimalType.of(38, 10))); + Schema schema = new Schema( + Types.NestedField.optional(1, binaryDecimal, Types.DecimalType.of(38, 10))); - Expression expr = Expressions.and( - lessThan(intDecimal, new BigDecimal("12.37")), - greaterThanOrEqual(intDecimal, new BigDecimal("12.35")) + Expression expr = or( + lessThan(binaryDecimal, new BigDecimal("12.34")), + greaterThanOrEqual(binaryDecimal, new BigDecimal("123456789.87654322")) ); - RowRanges expected = selectRowRanges(intDecimal, columnIndexStore, rowCount, 0, 1); + RowRanges expected = selectRowRanges(binaryDecimal, columnIndexStore, rowCount, 1); RowRanges actual = calculateRowRanges(schema, messageType, expr, 
true, columnIndexStore, rowCount); - - assertRowRangesEquals(expected, actual); - - expr = Expressions.and( - lessThan(binaryDecimal, new BigDecimal("12.37")), - greaterThanOrEqual(binaryDecimal, new BigDecimal("12.35")) - ); - - expected = selectRowRanges(binaryDecimal, columnIndexStore, rowCount, 0, 1); - actual = calculateRowRanges(schema, messageType, expr, true, columnIndexStore, rowCount); assertRowRangesEquals(expected, actual); - - expr = Expressions.greaterThan(binaryDecimal, new BigDecimal("99.99")); + expr = greaterThan(binaryDecimal, new BigDecimal("123456789.87654323")); expected = NO_ROWS; actual = calculateRowRanges(schema, messageType, expr, true, columnIndexStore, rowCount); assertRowRangesEquals(expected, actual); } - - private static class CIBuilder { - private static final ByteBuffer EMPTY = ByteBuffer.wrap(new byte[0]); - private final PrimitiveType type; - private final BoundaryOrder order; - boolean invalid = false; - private List nullPages = Lists.newArrayList(); - private List nullCounts = Lists.newArrayList(); - private List minValues = Lists.newArrayList(); - private List maxValues = Lists.newArrayList(); - - CIBuilder(PrimitiveType type, BoundaryOrder order) { - this.type = type; - this.order = order; - } - - CIBuilder addNullPage(long nullCount) { - nullPages.add(true); - nullCounts.add(nullCount); - minValues.add(EMPTY); - maxValues.add(EMPTY); - return this; - } - - CIBuilder addPage(long nullCount, byte[] min, byte[] max) { - nullPages.add(false); - nullCounts.add(nullCount); - minValues.add(ByteBuffer.wrap(min)); - maxValues.add(ByteBuffer.wrap(max)); - return this; - } - - CIBuilder addPage(long nullCount, int min, int max) { - nullPages.add(false); - nullCounts.add(nullCount); - minValues.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, min)); - maxValues.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, max)); - return this; - } - - CIBuilder addPage(long nullCount, 
long min, long max) { - nullPages.add(false); - nullCounts.add(nullCount); - minValues.add(ByteBuffer.allocate(Long.BYTES).order(ByteOrder.LITTLE_ENDIAN).putLong(0, min)); - maxValues.add(ByteBuffer.allocate(Long.BYTES).order(ByteOrder.LITTLE_ENDIAN).putLong(0, max)); - return this; - } - - CIBuilder addPage(long nullCount, String min, String max) { - nullPages.add(false); - nullCounts.add(nullCount); - minValues.add(ByteBuffer.wrap(min.getBytes(UTF_8))); - maxValues.add(ByteBuffer.wrap(max.getBytes(UTF_8))); - return this; - } - - CIBuilder addPage(long nullCount, double min, double max) { - if (Double.isNaN(min) || Double.isNaN(max)) { - invalid = true; - return this; - } - - nullPages.add(false); - nullCounts.add(nullCount); - minValues.add(ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, min)); - maxValues.add(ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, max)); - return this; - } - - ColumnIndex build() { - return invalid ? null : ColumnIndexBuilder.build(type, order, nullPages, nullCounts, minValues, maxValues); - } - } - - private static class OIBuilder { - private final OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder(); - - OIBuilder addPage(long rowCount) { - builder.add(1234, rowCount); - return this; - } - - OffsetIndex build() { - return builder.build(); - } - } } From 81d003d43bfa334516cb67a4519394e781e6cc72 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Tue, 28 Feb 2023 20:49:38 +0800 Subject: [PATCH 05/21] Minor fix. 
--- .../org/apache/iceberg/parquet/TestColumnIndexFilter.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java index 7b3c3969b5cd..dd9afe5d9e60 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -455,11 +455,7 @@ private RowRanges calculateRowRanges(Expression expr) { } private RowRanges calculateRowRanges(Expression expr, boolean caseSensitive) { - return calculateRowRanges(SCHEMA, FILE_SCHEMA, expr, caseSensitive); - } - - private RowRanges calculateRowRanges(Schema schema, MessageType messageType, Expression expr, boolean caseSensitive) { - return calculateRowRanges(schema, messageType, expr, caseSensitive, STORE, TOTAL_ROW_COUNT); + return calculateRowRanges(SCHEMA, FILE_SCHEMA, expr, caseSensitive, STORE, TOTAL_ROW_COUNT); } private RowRanges calculateRowRanges(Schema schema, MessageType messageType, Expression expr, From 125bcfc126eee0b2824e8c246f512a378fcde3c2 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Tue, 28 Feb 2023 20:51:01 +0800 Subject: [PATCH 06/21] Spotless. 
--- .../iceberg/parquet/PageSkippingHelpers.java | 72 ++- .../parquet/ParquetColumnIndexFilter.java | 612 ++++++++++-------- .../parquet/TestColumnIndexFilter.java | 568 ++++++++-------- 3 files changed, 682 insertions(+), 570 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java b/parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java index 36f1d5bf804c..c56bdf0b2d7d 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; import java.util.List; @@ -32,16 +31,12 @@ import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; import org.apache.parquet.internal.filter2.columnindex.RowRanges; -/** - * Helper methods for page skipping. - */ +/** Helper methods for page skipping. 
*/ class PageSkippingHelpers { - private PageSkippingHelpers() { - } + private PageSkippingHelpers() {} - private static final DynConstructors.Ctor RANGES_LIST_CTOR = DynConstructors.builder() - .hiddenImpl(RowRanges.class, List.class) - .build(); + private static final DynConstructors.Ctor RANGES_LIST_CTOR = + DynConstructors.builder().hiddenImpl(RowRanges.class, List.class).build(); private static final RowRanges EMPTY = RANGES_LIST_CTOR.newInstance(ImmutableList.of()); @@ -49,49 +44,57 @@ static RowRanges empty() { return EMPTY; } - private static final DynMethods.StaticMethod UNION = DynMethods.builder("union") - .hiddenImpl(RowRanges.class, RowRanges.class, RowRanges.class) - .buildStatic(); + private static final DynMethods.StaticMethod UNION = + DynMethods.builder("union") + .hiddenImpl(RowRanges.class, RowRanges.class, RowRanges.class) + .buildStatic(); static RowRanges union(RowRanges left, RowRanges right) { return UNION.invoke(left, right); } - private static final DynMethods.StaticMethod INTERSECTION = DynMethods.builder("intersection") - .hiddenImpl(RowRanges.class, RowRanges.class, RowRanges.class) - .buildStatic(); + private static final DynMethods.StaticMethod INTERSECTION = + DynMethods.builder("intersection") + .hiddenImpl(RowRanges.class, RowRanges.class, RowRanges.class) + .buildStatic(); static RowRanges intersection(RowRanges left, RowRanges right) { return INTERSECTION.invoke(left, right); } - private static final DynMethods.StaticMethod ROW_RANGES_CREATE = DynMethods.builder("create") - .hiddenImpl(RowRanges.class, long.class, PrimitiveIterator.OfInt.class, OffsetIndex.class) - .buildStatic(); + private static final DynMethods.StaticMethod ROW_RANGES_CREATE = + DynMethods.builder("create") + .hiddenImpl(RowRanges.class, long.class, PrimitiveIterator.OfInt.class, OffsetIndex.class) + .buildStatic(); - static RowRanges createRowRanges(long rowCount, PrimitiveIterator.OfInt pageIndexes, OffsetIndex offsetIndex) { + static RowRanges 
createRowRanges( + long rowCount, PrimitiveIterator.OfInt pageIndexes, OffsetIndex offsetIndex) { return ROW_RANGES_CREATE.invoke(rowCount, pageIndexes, offsetIndex); } - private static final DynMethods.StaticMethod ROW_RANGES_CREATE_SINGLE = DynMethods.builder("createSingle") - .hiddenImpl(RowRanges.class, long.class) - .buildStatic(); + private static final DynMethods.StaticMethod ROW_RANGES_CREATE_SINGLE = + DynMethods.builder("createSingle").hiddenImpl(RowRanges.class, long.class).buildStatic(); static RowRanges allRows(long rowCount) { return ROW_RANGES_CREATE_SINGLE.invoke(rowCount); } - private static final DynMethods.StaticMethod INDEX_ITERATOR_ALL = DynMethods.builder("all") - .hiddenImpl("org.apache.parquet.internal.column.columnindex.IndexIterator", int.class) - .buildStatic(); + private static final DynMethods.StaticMethod INDEX_ITERATOR_ALL = + DynMethods.builder("all") + .hiddenImpl("org.apache.parquet.internal.column.columnindex.IndexIterator", int.class) + .buildStatic(); static PrimitiveIterator.OfInt allPageIndexes(int pageCount) { return INDEX_ITERATOR_ALL.invoke(pageCount); } - private static final DynMethods.StaticMethod INDEX_ITERATOR_FILTER = DynMethods.builder("filter") - .hiddenImpl("org.apache.parquet.internal.column.columnindex.IndexIterator", int.class, IntPredicate.class) - .buildStatic(); + private static final DynMethods.StaticMethod INDEX_ITERATOR_FILTER = + DynMethods.builder("filter") + .hiddenImpl( + "org.apache.parquet.internal.column.columnindex.IndexIterator", + int.class, + IntPredicate.class) + .buildStatic(); static PrimitiveIterator.OfInt filterPageIndexes(int pageCount, IntPredicate filter) { return INDEX_ITERATOR_FILTER.invoke(pageCount, filter); @@ -108,13 +111,18 @@ static ColumnIndexStore getColumnIndexStore(ParquetFileReader reader, int blockI private static final DynMethods.UnboundMethod INTERNAL_READ_FILTERED_ROW_GROUP = DynMethods.builder("internalReadFilteredRowGroup") - 
.hiddenImpl("org.apache.parquet.hadoop.ParquetFileReader", - BlockMetaData.class, RowRanges.class, ColumnIndexStore.class) + .hiddenImpl( + "org.apache.parquet.hadoop.ParquetFileReader", + BlockMetaData.class, + RowRanges.class, + ColumnIndexStore.class) .build(); - static PageReadStore internalReadFilteredRowGroup(ParquetFileReader reader, int blockIndex, RowRanges rowRanges) { + static PageReadStore internalReadFilteredRowGroup( + ParquetFileReader reader, int blockIndex, RowRanges rowRanges) { ColumnIndexStore columnIndexStore = GET_COLUMN_INDEX_STORE.invoke(reader, blockIndex); BlockMetaData blockMetaData = reader.getRowGroups().get(blockIndex); - return INTERNAL_READ_FILTERED_ROW_GROUP.invoke(reader, blockMetaData, rowRanges, columnIndexStore); + return INTERNAL_READ_FILTERED_ROW_GROUP.invoke( + reader, blockMetaData, rowRanges, columnIndexStore); } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java index 6f0ad1553694..be44ef68c042 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java @@ -16,9 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.parquet; +import static org.apache.iceberg.parquet.PageSkippingHelpers.allPageIndexes; +import static org.apache.iceberg.parquet.PageSkippingHelpers.allRows; +import static org.apache.iceberg.parquet.PageSkippingHelpers.filterPageIndexes; +import static org.apache.iceberg.parquet.PageSkippingHelpers.intersection; +import static org.apache.iceberg.parquet.PageSkippingHelpers.union; + import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; @@ -56,12 +61,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.iceberg.parquet.PageSkippingHelpers.allPageIndexes; -import static org.apache.iceberg.parquet.PageSkippingHelpers.allRows; -import static org.apache.iceberg.parquet.PageSkippingHelpers.filterPageIndexes; -import static org.apache.iceberg.parquet.PageSkippingHelpers.intersection; -import static org.apache.iceberg.parquet.PageSkippingHelpers.union; - public class ParquetColumnIndexFilter { private static final Logger LOG = LoggerFactory.getLogger(ParquetColumnIndexFilter.class); @@ -76,13 +75,15 @@ public ParquetColumnIndexFilter(Schema schema, Expression unbound, boolean caseS /** * Calculates the row ranges containing the indexes of the rows might match the expression. 
+ * * @param fileSchema schema of file * @param columnIndexStore the store for providing column/offset indexes - * @param rowCount the total number of rows in the row-group - * @return the ranges of the possible matching row indexes; the returned ranges will contain all the rows - * if any of the required offset index is missing + * @param rowCount the total number of rows in the row-group + * @return the ranges of the possible matching row indexes; the returned ranges will contain all + * the rows if any of the required offset index is missing */ - public RowRanges calculateRowRanges(MessageType fileSchema, ColumnIndexStore columnIndexStore, long rowCount) { + public RowRanges calculateRowRanges( + MessageType fileSchema, ColumnIndexStore columnIndexStore, long rowCount) { try { return new ColumnIndexEvalVisitor(fileSchema, columnIndexStore, rowCount).eval(); } catch (ColumnIndexStore.MissingOffsetIndexException e) { @@ -95,7 +96,8 @@ public RowRanges calculateRowRanges(MessageType fileSchema, ColumnIndexStore col private static final boolean ROWS_CANNOT_MATCH = false; private static final RowRanges NO_ROWS = PageSkippingHelpers.empty(); - private class ColumnIndexEvalVisitor extends ExpressionVisitors.BoundExpressionVisitor { + private class ColumnIndexEvalVisitor + extends ExpressionVisitors.BoundExpressionVisitor { private final Map idToColumn = Maps.newHashMap(); private final Map idToColumnIndex = Maps.newHashMap(); @@ -107,7 +109,8 @@ private class ColumnIndexEvalVisitor extends ExpressionVisitors.BoundExpressionV private final ColumnIndexStore columnIndexStore; private final long rowCount; - private ColumnIndexEvalVisitor(MessageType fileSchema, ColumnIndexStore columnIndexStore, long rowCount) { + private ColumnIndexEvalVisitor( + MessageType fileSchema, ColumnIndexStore columnIndexStore, long rowCount) { this.allRows = allRows(rowCount); this.columnIndexStore = columnIndexStore; this.rowCount = rowCount; @@ -163,14 +166,16 @@ public RowRanges or(RowRanges 
left, RowRanges right) { public RowRanges isNull(BoundReference ref) { int id = ref.fieldId(); - Function func = columnIndex -> { - if (columnIndex.hasNullCounts()) { - return filterPageIndexes(columnIndex.pageCount(), columnIndex::containsNull); - } else { - // Searching for nulls so if we don't have null related statistics we have to return all pages - return allPageIndexes(columnIndex.pageCount()); - } - }; + Function func = + columnIndex -> { + if (columnIndex.hasNullCounts()) { + return filterPageIndexes(columnIndex.pageCount(), columnIndex::containsNull); + } else { + // Searching for nulls so if we don't have null related statistics we have to return + // all pages + return allPageIndexes(columnIndex.pageCount()); + } + }; return applyPredicate(id, func, ROWS_MIGHT_MATCH); } @@ -204,7 +209,8 @@ public RowRanges isNaN(BoundReference ref) { @Override public RowRanges notNaN(BoundReference ref) { - // Parquet column index does not contain statistics about NaN values, so cannot filter out any pages. + // Parquet column index does not contain statistics about NaN values, so cannot filter out any + // pages. 
return allRows; } @@ -212,23 +218,24 @@ public RowRanges notNaN(BoundReference ref) { public RowRanges lt(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { - - IntPredicate filter = pageIndex -> { - if (columnIndex.isNullPage(pageIndex)) { - return ROWS_CANNOT_MATCH; - } - - T lower = columnIndex.min(pageIndex); - if (lit.comparator().compare(lower, lit.value()) >= 0) { - return ROWS_CANNOT_MATCH; - } - - return ROWS_MIGHT_MATCH; - }; - - return filterPageIndexes(columnIndex.pageCount(), filter); - }; + Function func = + columnIndex -> { + IntPredicate filter = + pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T lower = columnIndex.min(pageIndex); + if (lit.comparator().compare(lower, lit.value()) >= 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + + return filterPageIndexes(columnIndex.pageCount(), filter); + }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -237,23 +244,24 @@ public RowRanges lt(BoundReference ref, Literal lit) { public RowRanges ltEq(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { - - IntPredicate filter = pageIndex -> { - if (columnIndex.isNullPage(pageIndex)) { - return ROWS_CANNOT_MATCH; - } - - T lower = columnIndex.min(pageIndex); - if (lit.comparator().compare(lower, lit.value()) > 0) { - return ROWS_CANNOT_MATCH; - } - - return ROWS_MIGHT_MATCH; - }; - - return filterPageIndexes(columnIndex.pageCount(), filter); - }; + Function func = + columnIndex -> { + IntPredicate filter = + pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T lower = columnIndex.min(pageIndex); + if (lit.comparator().compare(lower, lit.value()) > 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + + return filterPageIndexes(columnIndex.pageCount(), filter); + }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -262,22 +270,23 @@ 
public RowRanges ltEq(BoundReference ref, Literal lit) { public RowRanges gt(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { - - IntPredicate filter = pageIndex -> { - if (columnIndex.isNullPage(pageIndex)) { - return ROWS_CANNOT_MATCH; - } - - T upper = columnIndex.max(pageIndex); - if (lit.comparator().compare(upper, lit.value()) <= 0) { - return ROWS_CANNOT_MATCH; - } - - return ROWS_MIGHT_MATCH; - }; - return filterPageIndexes(columnIndex.pageCount(), filter); - }; + Function func = + columnIndex -> { + IntPredicate filter = + pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T upper = columnIndex.max(pageIndex); + if (lit.comparator().compare(upper, lit.value()) <= 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + return filterPageIndexes(columnIndex.pageCount(), filter); + }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -286,22 +295,23 @@ public RowRanges gt(BoundReference ref, Literal lit) { public RowRanges gtEq(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { - - IntPredicate filter = pageIndex -> { - if (columnIndex.isNullPage(pageIndex)) { - return ROWS_CANNOT_MATCH; - } - - T upper = columnIndex.max(pageIndex); - if (lit.comparator().compare(upper, lit.value()) < 0) { - return ROWS_CANNOT_MATCH; - } - - return ROWS_MIGHT_MATCH; - }; - return filterPageIndexes(columnIndex.pageCount(), filter); - }; + Function func = + columnIndex -> { + IntPredicate filter = + pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T upper = columnIndex.max(pageIndex); + if (lit.comparator().compare(upper, lit.value()) < 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + return filterPageIndexes(columnIndex.pageCount(), filter); + }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -310,28 +320,29 @@ public RowRanges gtEq(BoundReference 
ref, Literal lit) { public RowRanges eq(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { - - IntPredicate filter = pageIndex -> { - if (columnIndex.isNullPage(pageIndex)) { - return ROWS_CANNOT_MATCH; - } - - T lower = columnIndex.min(pageIndex); - if (lit.comparator().compare(lower, lit.value()) > 0) { - return ROWS_CANNOT_MATCH; - } - - T upper = columnIndex.max(pageIndex); - if (lit.comparator().compare(upper, lit.value()) < 0) { - return ROWS_CANNOT_MATCH; - } - - return ROWS_MIGHT_MATCH; - }; - - return filterPageIndexes(columnIndex.pageCount(), filter); - }; + Function func = + columnIndex -> { + IntPredicate filter = + pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + } + + T lower = columnIndex.min(pageIndex); + if (lit.comparator().compare(lower, lit.value()) > 0) { + return ROWS_CANNOT_MATCH; + } + + T upper = columnIndex.max(pageIndex); + if (lit.comparator().compare(upper, lit.value()) < 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + + return filterPageIndexes(columnIndex.pageCount(), filter); + }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -346,28 +357,29 @@ public RowRanges in(BoundReference ref, Set literalSet) { int id = ref.fieldId(); Pair minMax = minMax(ref.comparator(), literalSet); - Function func = columnIndex -> { - - IntPredicate filter = pageIndex -> { - if (columnIndex.isNullPage(pageIndex)) { - return ROWS_CANNOT_MATCH; - } - - T lower = columnIndex.min(pageIndex); - if (ref.comparator().compare(lower, minMax.second()) > 0) { - return ROWS_CANNOT_MATCH; - } - - T upper = columnIndex.max(pageIndex); - if (ref.comparator().compare(upper, minMax.first()) < 0) { - return ROWS_CANNOT_MATCH; - } - - return ROWS_MIGHT_MATCH; - }; - - return filterPageIndexes(columnIndex.pageCount(), filter); - }; + Function func = + columnIndex -> { + IntPredicate filter = + pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return 
ROWS_CANNOT_MATCH; + } + + T lower = columnIndex.min(pageIndex); + if (ref.comparator().compare(lower, minMax.second()) > 0) { + return ROWS_CANNOT_MATCH; + } + + T upper = columnIndex.max(pageIndex); + if (ref.comparator().compare(upper, minMax.first()) < 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + + return filterPageIndexes(columnIndex.pageCount(), filter); + }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -401,38 +413,45 @@ public RowRanges notIn(BoundReference ref, Set literalSet) { public RowRanges startsWith(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { - - ByteBuffer prefixAsBytes = lit.toByteBuffer(); - Comparator comparator = Comparators.unsignedBytes(); - - IntPredicate filter = pageIndex -> { - if (columnIndex.isNullPage(pageIndex)) { - return ROWS_CANNOT_MATCH; - } - - ByteBuffer lower = columnIndex.minBuffer(pageIndex); - - // truncate lower bound so that its length in bytes is not greater than the length of prefix - int lowerLength = Math.min(prefixAsBytes.remaining(), lower.remaining()); - int lowerCmp = comparator.compare(BinaryUtil.truncateBinary(lower, lowerLength), prefixAsBytes); - if (lowerCmp > 0) { - return ROWS_CANNOT_MATCH; - } - - ByteBuffer upper = columnIndex.maxBuffer(pageIndex); - // truncate upper bound so that its length in bytes is not greater than the length of prefix - int upperLength = Math.min(prefixAsBytes.remaining(), upper.remaining()); - int upperCmp = comparator.compare(BinaryUtil.truncateBinary(upper, upperLength), prefixAsBytes); - if (upperCmp < 0) { - return ROWS_CANNOT_MATCH; - } - - return ROWS_MIGHT_MATCH; - }; - - return filterPageIndexes(columnIndex.pageCount(), filter); - }; + Function func = + columnIndex -> { + ByteBuffer prefixAsBytes = lit.toByteBuffer(); + Comparator comparator = Comparators.unsignedBytes(); + + IntPredicate filter = + pageIndex -> { + if (columnIndex.isNullPage(pageIndex)) { + return ROWS_CANNOT_MATCH; + 
} + + ByteBuffer lower = columnIndex.minBuffer(pageIndex); + + // truncate lower bound so that its length in bytes is not greater than the length + // of prefix + int lowerLength = Math.min(prefixAsBytes.remaining(), lower.remaining()); + int lowerCmp = + comparator.compare( + BinaryUtil.truncateBinary(lower, lowerLength), prefixAsBytes); + if (lowerCmp > 0) { + return ROWS_CANNOT_MATCH; + } + + ByteBuffer upper = columnIndex.maxBuffer(pageIndex); + // truncate upper bound so that its length in bytes is not greater than the length + // of prefix + int upperLength = Math.min(prefixAsBytes.remaining(), upper.remaining()); + int upperCmp = + comparator.compare( + BinaryUtil.truncateBinary(upper, upperLength), prefixAsBytes); + if (upperCmp < 0) { + return ROWS_CANNOT_MATCH; + } + + return ROWS_MIGHT_MATCH; + }; + + return filterPageIndexes(columnIndex.pageCount(), filter); + }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -441,58 +460,70 @@ public RowRanges startsWith(BoundReference ref, Literal lit) { public RowRanges notStartsWith(BoundReference ref, Literal lit) { int id = ref.fieldId(); - Function func = columnIndex -> { - IntPredicate filter; - if (columnIndex.hasNullCounts()) { - ByteBuffer prefixAsBytes = lit.toByteBuffer(); - Comparator comparator = Comparators.unsignedBytes(); - - filter = pageIndex -> { - if (columnIndex.containsNull(pageIndex)) { - return ROWS_MIGHT_MATCH; - } - - ByteBuffer lower = columnIndex.minBuffer(pageIndex); - // if lower is shorter than the prefix, it can't start with the prefix - if (lower.remaining() < prefixAsBytes.remaining()) { - return ROWS_MIGHT_MATCH; - } - - // truncate lower bound so that its length in bytes is not greater than the length of prefix - int cmp = comparator.compare(BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), prefixAsBytes); - - if (cmp == 0) { - ByteBuffer upper = columnIndex.maxBuffer(pageIndex); - // the lower bound starts with the prefix; check the upper bound - // if upper is 
shorter than the prefix, it can't start with the prefix - if (upper.remaining() < prefixAsBytes.remaining()) { - return ROWS_MIGHT_MATCH; - } - - // truncate upper bound so that its length in bytes is not greater than the length of prefix - cmp = comparator.compare(BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), prefixAsBytes); - if (cmp == 0) { - // both bounds match the prefix, so all rows must match the prefix and none do not match - return ROWS_CANNOT_MATCH; - } + Function func = + columnIndex -> { + IntPredicate filter; + if (columnIndex.hasNullCounts()) { + ByteBuffer prefixAsBytes = lit.toByteBuffer(); + Comparator comparator = Comparators.unsignedBytes(); + + filter = + pageIndex -> { + if (columnIndex.containsNull(pageIndex)) { + return ROWS_MIGHT_MATCH; + } + + ByteBuffer lower = columnIndex.minBuffer(pageIndex); + // if lower is shorter than the prefix, it can't start with the prefix + if (lower.remaining() < prefixAsBytes.remaining()) { + return ROWS_MIGHT_MATCH; + } + + // truncate lower bound so that its length in bytes is not greater than the + // length of prefix + int cmp = + comparator.compare( + BinaryUtil.truncateBinary(lower, prefixAsBytes.remaining()), + prefixAsBytes); + + if (cmp == 0) { + ByteBuffer upper = columnIndex.maxBuffer(pageIndex); + // the lower bound starts with the prefix; check the upper bound + // if upper is shorter than the prefix, it can't start with the prefix + if (upper.remaining() < prefixAsBytes.remaining()) { + return ROWS_MIGHT_MATCH; + } + + // truncate upper bound so that its length in bytes is not greater than the + // length of prefix + cmp = + comparator.compare( + BinaryUtil.truncateBinary(upper, prefixAsBytes.remaining()), + prefixAsBytes); + if (cmp == 0) { + // both bounds match the prefix, so all rows must match the prefix and none + // do not match + return ROWS_CANNOT_MATCH; + } + } + + return ROWS_MIGHT_MATCH; + }; + } else { + // Return all pages if we don't have null counts statistics + 
filter = pageIndex -> ROWS_MIGHT_MATCH; } - return ROWS_MIGHT_MATCH; + return filterPageIndexes(columnIndex.pageCount(), filter); }; - } else { - // Return all pages if we don't have null counts statistics - filter = pageIndex -> ROWS_MIGHT_MATCH; - } - - return filterPageIndexes(columnIndex.pageCount(), filter); - }; return applyPredicate(id, func, ROWS_MIGHT_MATCH); } - private RowRanges applyPredicate(int columnId, - Function func, - boolean missingColumnMightMatch) { + private RowRanges applyPredicate( + int columnId, + Function func, + boolean missingColumnMightMatch) { if (!idToColumn.containsKey(columnId)) { return missingColumnMightMatch ? allRows : NO_ROWS; @@ -504,8 +535,9 @@ private RowRanges applyPredicate(int columnId, OffsetIndex offsetIndex = offsetIndex(columnId); ParquetColumnIndex columnIndex = columnIndex(columnId); if (columnIndex == null) { - LOG.info("No column index for column {} is available; Unable to filter on this column", - idToColumn.get(columnId)); + LOG.info( + "No column index for column {} is available; Unable to filter on this column", + idToColumn.get(columnId)); return allRows; } @@ -514,7 +546,8 @@ private RowRanges applyPredicate(int columnId, // Assumes that the column corresponding to the id exists in the file. private OffsetIndex offsetIndex(int columnId) { - return idToOffsetIndex.computeIfAbsent(columnId, k -> columnIndexStore.getOffsetIndex(idToColumn.get(k))); + return idToOffsetIndex.computeIfAbsent( + columnId, k -> columnIndexStore.getOffsetIndex(idToColumn.get(k))); } // Assumes that the column corresponding to the id exists in the file. 
@@ -524,7 +557,9 @@ private ParquetColumnIndex columnIndex(int columnId) { if (wrapper == null) { ColumnIndex columnIndex = columnIndexStore.getColumnIndex(idToColumn.get(columnId)); if (columnIndex != null) { - wrapper = new ParquetColumnIndex(columnIndex, parquetTypes.get(columnId), icebergTypes.get(columnId)); + wrapper = + new ParquetColumnIndex( + columnIndex, parquetTypes.get(columnId), icebergTypes.get(columnId)); idToColumnIndex.put(columnId, wrapper); } } @@ -534,7 +569,8 @@ private ParquetColumnIndex columnIndex(int columnId) { } /** - * A wrapper for ColumnIndex, which will cache statistics data and convert min max buffers to Iceberg type values. + * A wrapper for ColumnIndex, which will cache statistics data and convert min max buffers to + * Iceberg type values. */ private static class ParquetColumnIndex { private final ColumnIndex columnIndex; @@ -546,7 +582,8 @@ private static class ParquetColumnIndex { private List maxBuffers; private List nullCounts; // optional field - private ParquetColumnIndex(ColumnIndex columnIndex, PrimitiveType primitiveType, Type.PrimitiveType icebergType) { + private ParquetColumnIndex( + ColumnIndex columnIndex, PrimitiveType primitiveType, Type.PrimitiveType icebergType) { this.columnIndex = columnIndex; this.primitiveType = primitiveType; this.icebergType = icebergType; @@ -613,70 +650,89 @@ private int pageCount() { } @SuppressWarnings("unchecked") - private T fromBytes(ByteBuffer bytes, PrimitiveType primitiveType, Type.PrimitiveType icebergType) { + private T fromBytes( + ByteBuffer bytes, PrimitiveType primitiveType, Type.PrimitiveType icebergType) { LogicalTypeAnnotation logicalTypeAnnotation = primitiveType.getLogicalTypeAnnotation(); - Optional converted = logicalTypeAnnotation == null ? 
Optional.empty() : logicalTypeAnnotation - .accept(new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { - @Override - public Optional visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { - return Optional.of(StandardCharsets.UTF_8.decode(bytes)); - } - - @Override - public Optional visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { - return Optional.of(StandardCharsets.UTF_8.decode(bytes)); - } - - @Override - public Optional visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalType) { - switch (primitiveType.getPrimitiveTypeName()) { - case INT32: - return Optional.of(new BigDecimal(BigInteger.valueOf(bytes.getInt(0)), decimalType.getScale())); - case INT64: - return Optional.of(new BigDecimal(BigInteger.valueOf(bytes.getLong(0)), decimalType.getScale())); - case BINARY: - case FIXED_LEN_BYTE_ARRAY: - return Optional.of(new BigDecimal(new BigInteger(ByteBuffers.toByteArray(bytes)), decimalType.getScale())); - } - return Optional.empty(); - } - - @Override - public Optional visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { - switch (timeLogicalType.getUnit()) { - case MILLIS: - return Optional.of(((long) bytes.getInt(0)) * 1000L); - case MICROS: - return Optional.of(bytes.getLong(0)); - case NANOS: - return Optional.of(Math.floorDiv(bytes.getLong(0), 1000)); - } - return Optional.empty(); - } - - @Override - public Optional visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { - switch (timestampLogicalType.getUnit()) { - case MILLIS: - return Optional.of(bytes.getLong(0) * 1000); - case MICROS: - return Optional.of(bytes.getLong(0)); - case NANOS: - return Optional.of(Math.floorDiv(bytes.getLong(0), 1000)); - } - return Optional.empty(); - } - - @Override - public Optional visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { - return Optional.of(StandardCharsets.UTF_8.decode(bytes)); - } - - @Override - public Optional 
visit(LogicalTypeAnnotation.UUIDLogicalTypeAnnotation uuidLogicalType) { - return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit(uuidLogicalType); - } - }); + Optional converted = + logicalTypeAnnotation == null + ? Optional.empty() + : logicalTypeAnnotation.accept( + new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor() { + @Override + public Optional visit( + LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) { + return Optional.of(StandardCharsets.UTF_8.decode(bytes)); + } + + @Override + public Optional visit( + LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) { + return Optional.of(StandardCharsets.UTF_8.decode(bytes)); + } + + @Override + public Optional visit( + LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalType) { + switch (primitiveType.getPrimitiveTypeName()) { + case INT32: + return Optional.of( + new BigDecimal( + BigInteger.valueOf(bytes.getInt(0)), decimalType.getScale())); + case INT64: + return Optional.of( + new BigDecimal( + BigInteger.valueOf(bytes.getLong(0)), decimalType.getScale())); + case BINARY: + case FIXED_LEN_BYTE_ARRAY: + return Optional.of( + new BigDecimal( + new BigInteger(ByteBuffers.toByteArray(bytes)), + decimalType.getScale())); + } + return Optional.empty(); + } + + @Override + public Optional visit( + LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) { + switch (timeLogicalType.getUnit()) { + case MILLIS: + return Optional.of(((long) bytes.getInt(0)) * 1000L); + case MICROS: + return Optional.of(bytes.getLong(0)); + case NANOS: + return Optional.of(Math.floorDiv(bytes.getLong(0), 1000)); + } + return Optional.empty(); + } + + @Override + public Optional visit( + LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) { + switch (timestampLogicalType.getUnit()) { + case MILLIS: + return Optional.of(bytes.getLong(0) * 1000); + case MICROS: + return Optional.of(bytes.getLong(0)); + case NANOS: + return 
Optional.of(Math.floorDiv(bytes.getLong(0), 1000)); + } + return Optional.empty(); + } + + @Override + public Optional visit( + LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) { + return Optional.of(StandardCharsets.UTF_8.decode(bytes)); + } + + @Override + public Optional visit( + LogicalTypeAnnotation.UUIDLogicalTypeAnnotation uuidLogicalType) { + return LogicalTypeAnnotation.LogicalTypeAnnotationVisitor.super.visit( + uuidLogicalType); + } + }); if (converted.isPresent()) { return (T) converted.get(); diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java index dd9afe5d9e60..258fde17c07e 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -16,9 +16,37 @@ * specific language governing permissions and limitations * under the License. */ - package org.apache.iceberg.parquet; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.iceberg.expressions.Expressions.and; +import static org.apache.iceberg.expressions.Expressions.equal; +import static org.apache.iceberg.expressions.Expressions.greaterThan; +import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.in; +import static org.apache.iceberg.expressions.Expressions.isNaN; +import static org.apache.iceberg.expressions.Expressions.isNull; +import static org.apache.iceberg.expressions.Expressions.lessThan; +import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; +import static org.apache.iceberg.expressions.Expressions.not; +import static org.apache.iceberg.expressions.Expressions.notEqual; +import static org.apache.iceberg.expressions.Expressions.notIn; +import static org.apache.iceberg.expressions.Expressions.notNaN; +import static 
org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; +import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.ASCENDING; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCENDING; +import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.UNORDERED; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; +import static org.apache.parquet.schema.Types.optional; + import java.math.BigDecimal; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -49,39 +77,8 @@ import org.junit.Assert; import org.junit.Test; -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.iceberg.expressions.Expressions.and; -import static org.apache.iceberg.expressions.Expressions.equal; -import static org.apache.iceberg.expressions.Expressions.greaterThan; -import static org.apache.iceberg.expressions.Expressions.greaterThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.in; -import static org.apache.iceberg.expressions.Expressions.isNaN; -import static org.apache.iceberg.expressions.Expressions.isNull; -import static org.apache.iceberg.expressions.Expressions.lessThan; -import static org.apache.iceberg.expressions.Expressions.lessThanOrEqual; -import static org.apache.iceberg.expressions.Expressions.not; -import static 
org.apache.iceberg.expressions.Expressions.notEqual; -import static org.apache.iceberg.expressions.Expressions.notIn; -import static org.apache.iceberg.expressions.Expressions.notNaN; -import static org.apache.iceberg.expressions.Expressions.notNull; -import static org.apache.iceberg.expressions.Expressions.notStartsWith; -import static org.apache.iceberg.expressions.Expressions.or; -import static org.apache.iceberg.expressions.Expressions.startsWith; -import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.ASCENDING; -import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.DESCENDING; -import static org.apache.parquet.internal.column.columnindex.BoundaryOrder.UNORDERED; -import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; -import static org.apache.parquet.schema.Types.optional; - public class TestColumnIndexFilter { - /** - * COPIED FROM org.apache.parquet.internal.filter2.columnindex.TestColumnIndexFilter - **/ + /** COPIED FROM org.apache.parquet.internal.filter2.columnindex.TestColumnIndexFilter */ private static class CIBuilder { private static final ByteBuffer EMPTY = ByteBuffer.wrap(new byte[0]); private final PrimitiveType type; @@ -116,8 +113,10 @@ CIBuilder addPage(long nullCount, byte[] min, byte[] max) { CIBuilder addPage(long nullCount, int min, int max) { nullPages.add(false); nullCounts.add(nullCount); - minValues.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, min)); - 
maxValues.add(ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, max)); + minValues.add( + ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, min)); + maxValues.add( + ByteBuffer.allocate(Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN).putInt(0, max)); return this; } @@ -145,13 +144,17 @@ CIBuilder addPage(long nullCount, double min, double max) { nullPages.add(false); nullCounts.add(nullCount); - minValues.add(ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, min)); - maxValues.add(ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, max)); + minValues.add( + ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, min)); + maxValues.add( + ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN).putDouble(0, max)); return this; } ColumnIndex build() { - return invalid ? null : ColumnIndexBuilder.build(type, order, nullPages, nullCounts, minValues, maxValues); + return invalid + ? 
null + : ColumnIndexBuilder.build(type, order, nullPages, nullCounts, minValues, maxValues); } } @@ -177,43 +180,51 @@ OffsetIndex build() { private static final String INT_DECIMAL_7_2 = "int_decimal_7_2"; private static final String NOT_IN_FILE = "not_in_file"; - private static final Schema SCHEMA = new Schema( - Types.NestedField.optional(1, INT_COL, Types.IntegerType.get()), - Types.NestedField.optional(2, STR_COL, Types.StringType.get()), - Types.NestedField.optional(3, NO_NANS, Types.DoubleType.get()), - Types.NestedField.optional(4, NO_CI, Types.DoubleType.get()), - Types.NestedField.optional(5, ALL_NULLS, Types.LongType.get()), - Types.NestedField.optional(6, INT_DECIMAL_7_2, Types.DecimalType.of(7, 2)), - Types.NestedField.optional(7, NOT_IN_FILE, Types.LongType.get()) - ); - - private static final MessageType FILE_SCHEMA = org.apache.parquet.schema.Types.buildMessage() - .addField(optional(INT32).id(1).named(INT_COL)) - .addField(optional(BINARY).id(2).as(LogicalTypeAnnotation.stringType()).id(2).named(STR_COL)) - .addField(optional(DOUBLE).id(3).named(NO_NANS)) - .addField(optional(DOUBLE).id(4).named(NO_CI)) - .addField(optional(INT64).id(5).named(ALL_NULLS)) - .addField(optional(INT32).id(6).as(LogicalTypeAnnotation.decimalType(2, 9)).named(INT_DECIMAL_7_2)) - .named("table"); - - private static final ColumnIndex INT_COL_CI = new CIBuilder(optional(INT32).named(INT_COL), ASCENDING) - .addPage(0, 1, 1) - .addPage(1, 2, 6) - .addPage(0, 7, 7) - .addPage(1, 7, 10) - .addPage(0, 11, 17) - .addPage(0, 18, 23) - .addPage(0, 24, 26) - .build(); - private static final OffsetIndex INT_COL_OI = new OIBuilder() - .addPage(1) - .addPage(6) - .addPage(2) - .addPage(5) - .addPage(7) - .addPage(6) - .addPage(3) - .build(); + private static final Schema SCHEMA = + new Schema( + Types.NestedField.optional(1, INT_COL, Types.IntegerType.get()), + Types.NestedField.optional(2, STR_COL, Types.StringType.get()), + Types.NestedField.optional(3, NO_NANS, 
Types.DoubleType.get()), + Types.NestedField.optional(4, NO_CI, Types.DoubleType.get()), + Types.NestedField.optional(5, ALL_NULLS, Types.LongType.get()), + Types.NestedField.optional(6, INT_DECIMAL_7_2, Types.DecimalType.of(7, 2)), + Types.NestedField.optional(7, NOT_IN_FILE, Types.LongType.get())); + + private static final MessageType FILE_SCHEMA = + org.apache.parquet.schema.Types.buildMessage() + .addField(optional(INT32).id(1).named(INT_COL)) + .addField( + optional(BINARY).id(2).as(LogicalTypeAnnotation.stringType()).id(2).named(STR_COL)) + .addField(optional(DOUBLE).id(3).named(NO_NANS)) + .addField(optional(DOUBLE).id(4).named(NO_CI)) + .addField(optional(INT64).id(5).named(ALL_NULLS)) + .addField( + optional(INT32) + .id(6) + .as(LogicalTypeAnnotation.decimalType(2, 9)) + .named(INT_DECIMAL_7_2)) + .named("table"); + + private static final ColumnIndex INT_COL_CI = + new CIBuilder(optional(INT32).named(INT_COL), ASCENDING) + .addPage(0, 1, 1) + .addPage(1, 2, 6) + .addPage(0, 7, 7) + .addPage(1, 7, 10) + .addPage(0, 11, 17) + .addPage(0, 18, 23) + .addPage(0, 24, 26) + .build(); + private static final OffsetIndex INT_COL_OI = + new OIBuilder() + .addPage(1) + .addPage(6) + .addPage(2) + .addPage(5) + .addPage(7) + .addPage(6) + .addPage(3) + .build(); private static final ColumnIndex STR_COL_CI = new CIBuilder(optional(BINARY).as(stringType()).named(STR_COL), DESCENDING) .addPage(0, "Zulu", "Zulu") @@ -225,119 +236,126 @@ OffsetIndex build() { .addPage(0, "Bravo", "India") .addPage(0, "Alfa", "Alfa") .build(); - private static final OffsetIndex STR_COL_OI = new OIBuilder() - .addPage(1) - .addPage(3) - .addPage(4) - .addPage(3) - .addPage(5) - .addPage(5) - .addPage(8) - .addPage(1) - .build(); - private static final ColumnIndex NO_NANS_CI = new CIBuilder(optional(DOUBLE).named(NO_NANS), UNORDERED) - .addPage(0, 2.03, 2.03) - .addPage(0, 0.56, 8.71) - .addPage(2, 3.14, 3.50) - .addPage(0, 2.71, 9.99) - .addPage(3, 0.36, 5.32) - .addPage(0, 4.17, 7.95) - 
.addNullPage(4) - .build(); - private static final OffsetIndex NO_NANS_OI = new OIBuilder() - .addPage(1) - .addPage(5) - .addPage(4) - .addPage(6) - .addPage(7) - .addPage(3) - .addPage(4) - .build(); + private static final OffsetIndex STR_COL_OI = + new OIBuilder() + .addPage(1) + .addPage(3) + .addPage(4) + .addPage(3) + .addPage(5) + .addPage(5) + .addPage(8) + .addPage(1) + .build(); + private static final ColumnIndex NO_NANS_CI = + new CIBuilder(optional(DOUBLE).named(NO_NANS), UNORDERED) + .addPage(0, 2.03, 2.03) + .addPage(0, 0.56, 8.71) + .addPage(2, 3.14, 3.50) + .addPage(0, 2.71, 9.99) + .addPage(3, 0.36, 5.32) + .addPage(0, 4.17, 7.95) + .addNullPage(4) + .build(); + private static final OffsetIndex NO_NANS_OI = + new OIBuilder() + .addPage(1) + .addPage(5) + .addPage(4) + .addPage(6) + .addPage(7) + .addPage(3) + .addPage(4) + .build(); private static final ColumnIndex NO_CI_CI = null; - private static final OffsetIndex NO_CI_OI = new OIBuilder() - .addPage(1) - .addPage(3) - .addPage(2) - .addPage(1) - .addPage(5) - .addPage(4) - .addPage(5) - .addPage(7) - .addPage(2) - .build(); - private static final ColumnIndex ALL_NULLS_CI = new CIBuilder(optional(INT64).named(ALL_NULLS), ASCENDING) - .addNullPage(1) - .addNullPage(29) - .build(); - private static final OffsetIndex ALL_NULLS_OI = new OIBuilder() - .addPage(1) - .addPage(29) - .build(); - private static final ColumnIndex INT_DECIMAL_7_2_CI = new CIBuilder(optional(INT32).named(INT_DECIMAL_7_2), UNORDERED) - .addPage(0, 99, 99) - .addPage(0, 100, 100) - .addPage(0, 101, 101) - .addPage(0, 98, 98) - .addPage(0, 99, 103) - .addNullPage(4) - .addPage(0, 100, 100) - .addPage(2, 87, 109) - .addNullPage(2) - .build(); - private static final OffsetIndex INT_DECIMAL_7_2_OI = new OIBuilder() - .addPage(1) - .addPage(3) - .addPage(2) - .addPage(1) - .addPage(5) - .addPage(4) - .addPage(5) - .addPage(7) - .addPage(2) - .build(); - private static final ColumnIndexStore STORE = new ColumnIndexStore() { - 
@Override - public ColumnIndex getColumnIndex(ColumnPath column) { - switch (column.toDotString()) { - case INT_COL: - return INT_COL_CI; - case STR_COL: - return STR_COL_CI; - case NO_NANS: - return NO_NANS_CI; - case NO_CI: - return NO_CI_CI; - case ALL_NULLS: - return ALL_NULLS_CI; - case INT_DECIMAL_7_2: - return INT_DECIMAL_7_2_CI; - default: - return null; - } - } + private static final OffsetIndex NO_CI_OI = + new OIBuilder() + .addPage(1) + .addPage(3) + .addPage(2) + .addPage(1) + .addPage(5) + .addPage(4) + .addPage(5) + .addPage(7) + .addPage(2) + .build(); + private static final ColumnIndex ALL_NULLS_CI = + new CIBuilder(optional(INT64).named(ALL_NULLS), ASCENDING) + .addNullPage(1) + .addNullPage(29) + .build(); + private static final OffsetIndex ALL_NULLS_OI = new OIBuilder().addPage(1).addPage(29).build(); + private static final ColumnIndex INT_DECIMAL_7_2_CI = + new CIBuilder(optional(INT32).named(INT_DECIMAL_7_2), UNORDERED) + .addPage(0, 99, 99) + .addPage(0, 100, 100) + .addPage(0, 101, 101) + .addPage(0, 98, 98) + .addPage(0, 99, 103) + .addNullPage(4) + .addPage(0, 100, 100) + .addPage(2, 87, 109) + .addNullPage(2) + .build(); + private static final OffsetIndex INT_DECIMAL_7_2_OI = + new OIBuilder() + .addPage(1) + .addPage(3) + .addPage(2) + .addPage(1) + .addPage(5) + .addPage(4) + .addPage(5) + .addPage(7) + .addPage(2) + .build(); + private static final ColumnIndexStore STORE = + new ColumnIndexStore() { + @Override + public ColumnIndex getColumnIndex(ColumnPath column) { + switch (column.toDotString()) { + case INT_COL: + return INT_COL_CI; + case STR_COL: + return STR_COL_CI; + case NO_NANS: + return NO_NANS_CI; + case NO_CI: + return NO_CI_CI; + case ALL_NULLS: + return ALL_NULLS_CI; + case INT_DECIMAL_7_2: + return INT_DECIMAL_7_2_CI; + default: + return null; + } + } - @Override - public OffsetIndex getOffsetIndex(ColumnPath column) { - switch (column.toDotString()) { - case INT_COL: - return INT_COL_OI; - case STR_COL: - return 
STR_COL_OI; - case NO_NANS: - return NO_NANS_OI; - case NO_CI: - return NO_CI_OI; - case ALL_NULLS: - return ALL_NULLS_OI; - case INT_DECIMAL_7_2: - return INT_DECIMAL_7_2_OI; - default: - throw new MissingOffsetIndexException(column); - } - } - }; + @Override + public OffsetIndex getOffsetIndex(ColumnPath column) { + switch (column.toDotString()) { + case INT_COL: + return INT_COL_OI; + case STR_COL: + return STR_COL_OI; + case NO_NANS: + return NO_NANS_OI; + case NO_CI: + return NO_CI_OI; + case ALL_NULLS: + return ALL_NULLS_OI; + case INT_DECIMAL_7_2: + return INT_DECIMAL_7_2_OI; + default: + throw new MissingOffsetIndexException(column); + } + } + }; /** + * + * *
    * row   int_col       str_col        no_nans        no_ci          all_nulls      int_decimal_7_2
    *                                                 (no column index)
@@ -391,29 +409,33 @@ public OffsetIndex getOffsetIndex(ColumnPath column) {
    * 29.  26             Alfa           null                          null           null
    * 
*/ - private static final RowRanges ALL_ROWS = PageSkippingHelpers.allRows(TOTAL_ROW_COUNT); + private static final RowRanges NO_ROWS = PageSkippingHelpers.empty(); private static RowRanges selectRowRanges(String path, int... pageIndexes) { return selectRowRanges(path, STORE, TOTAL_ROW_COUNT, pageIndexes); } - private static RowRanges selectRowRanges(String path, ColumnIndexStore store, long rowCount, int... pageIndexes) { - return PageSkippingHelpers.createRowRanges(rowCount, new PrimitiveIterator.OfInt() { - int index = -1; + private static RowRanges selectRowRanges( + String path, ColumnIndexStore store, long rowCount, int... pageIndexes) { + return PageSkippingHelpers.createRowRanges( + rowCount, + new PrimitiveIterator.OfInt() { + int index = -1; - @Override - public int nextInt() { - return pageIndexes[index]; - } + @Override + public int nextInt() { + return pageIndexes[index]; + } - @Override - public boolean hasNext() { - index += 1; - return index < pageIndexes.length; - } - }, store.getOffsetIndex(ColumnPath.fromDotString(path))); + @Override + public boolean hasNext() { + index += 1; + return index < pageIndexes.length; + } + }, + store.getOffsetIndex(ColumnPath.fromDotString(path))); } private boolean rowRangesEquals(RowRanges r1, RowRanges r2) { @@ -445,8 +467,8 @@ private boolean rowRangesEquals(RowRanges r1, RowRanges r2) { private void assertRowRangesEquals(RowRanges expected, RowRanges actual) { if (!rowRangesEquals(expected, actual)) { - throw new AssertionError(String.format("RowRanges are not equal, expected: %s, actual: %s", - expected, actual)); + throw new AssertionError( + String.format("RowRanges are not equal, expected: %s, actual: %s", expected, actual)); } } @@ -458,10 +480,15 @@ private RowRanges calculateRowRanges(Expression expr, boolean caseSensitive) { return calculateRowRanges(SCHEMA, FILE_SCHEMA, expr, caseSensitive, STORE, TOTAL_ROW_COUNT); } - private RowRanges calculateRowRanges(Schema schema, MessageType messageType, 
Expression expr, - boolean caseSensitive, ColumnIndexStore store, long rowCount) { + private RowRanges calculateRowRanges( + Schema schema, + MessageType messageType, + Expression expr, + boolean caseSensitive, + ColumnIndexStore store, + long rowCount) { return new ParquetColumnIndexFilter(schema, expr, caseSensitive) - .calculateRowRanges(messageType, store, rowCount); + .calculateRowRanges(messageType, store, rowCount); } @Test @@ -521,8 +548,10 @@ public void testNotNaN() { @Test public void testMissingColumn() { - Assert.assertThrows("Cannot find field 'missing'", - ValidationException.class, () -> calculateRowRanges(equal("missing", 0))); + Assert.assertThrows( + "Cannot find field 'missing'", + ValidationException.class, + () -> calculateRowRanges(equal("missing", 0))); } @Test @@ -547,7 +576,8 @@ public void testMissingColumnIndex() { @Test public void testNot() { - // ColumnIndexEvalVisitor does not support evaluating NOT expression, but NOT should be rewritten + // ColumnIndexEvalVisitor does not support evaluating NOT expression, but NOT should be + // rewritten RowRanges expected; expected = ALL_ROWS; @@ -573,7 +603,8 @@ public void testAnd() { assertRowRangesEquals(expected, calculateRowRanges(expr)); expr = and(equal(INT_COL, 2), equal(STR_COL, "Tango")); - expected = PageSkippingHelpers.intersection(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); + expected = + PageSkippingHelpers.intersection(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -713,8 +744,9 @@ public void testStringStartsWith() { assertRowRangesEquals(expected, calculateRowRanges(startsWith(STR_COL, "S"))); expected = selectRowRanges(STR_COL, 4, 6); - assertRowRangesEquals(expected, calculateRowRanges( - Expressions.or(startsWith(STR_COL, "Q"), startsWith(STR_COL, "G")))); + assertRowRangesEquals( + expected, + calculateRowRanges(Expressions.or(startsWith(STR_COL, "Q"), startsWith(STR_COL, "G")))); 
expected = selectRowRanges(STR_COL, 0); assertRowRangesEquals(expected, calculateRowRanges(startsWith(STR_COL, "Z"))); @@ -771,12 +803,14 @@ public void testIntTypePromotion() { expected = NO_ROWS; RowRanges actual = - calculateRowRanges(promotedLong, FILE_SCHEMA, equal(INT_COL, 0), true, STORE, TOTAL_ROW_COUNT); + calculateRowRanges( + promotedLong, FILE_SCHEMA, equal(INT_COL, 0), true, STORE, TOTAL_ROW_COUNT); assertRowRangesEquals(expected, actual); expected = selectRowRanges(INT_COL, 2, 3); actual = - calculateRowRanges(promotedLong, FILE_SCHEMA, equal(INT_COL, 7), true, STORE, TOTAL_ROW_COUNT); + calculateRowRanges( + promotedLong, FILE_SCHEMA, equal(INT_COL, 7), true, STORE, TOTAL_ROW_COUNT); assertRowRangesEquals(expected, actual); } @@ -784,13 +818,15 @@ public void testIntTypePromotion() { public void testMissingOffsetIndex() { RowRanges expected; - PrimitiveType missingOI = org.apache.parquet.schema.Types.primitive(INT32, Type.Repetition.REQUIRED) + PrimitiveType missingOI = + org.apache.parquet.schema.Types.primitive(INT32, Type.Repetition.REQUIRED) .id(1) .named("missing_oi"); MessageType messageType = new MessageType("table", missingOI); expected = ALL_ROWS; - RowRanges actual = calculateRowRanges(SCHEMA, messageType, equal(INT_COL, 1), true, STORE, TOTAL_ROW_COUNT); + RowRanges actual = + calculateRowRanges(SCHEMA, messageType, equal(INT_COL, 1), true, STORE, TOTAL_ROW_COUNT); assertRowRangesEquals(expected, actual); } @@ -803,8 +839,10 @@ public void testIntBackedDecimal() { assertRowRangesEquals(expected, calculateRowRanges(expr)); - expr = or(lessThan(INT_DECIMAL_7_2, new BigDecimal("1.00")), - greaterThan(INT_DECIMAL_7_2, new BigDecimal("1.01"))); + expr = + or( + lessThan(INT_DECIMAL_7_2, new BigDecimal("1.00")), + greaterThan(INT_DECIMAL_7_2, new BigDecimal("1.01"))); expected = selectRowRanges(INT_DECIMAL_7_2, 0, 3, 4, 7); assertRowRangesEquals(expected, calculateRowRanges(expr)); @@ -814,15 +852,17 @@ public void testIntBackedDecimal() { 
public void testDecimalTypePromotion() { RowRanges expected; - Schema promotedDecimal = new Schema(Types.NestedField.optional(6, INT_DECIMAL_7_2, Types.DecimalType.of(38, - 10))); + Schema promotedDecimal = + new Schema(Types.NestedField.optional(6, INT_DECIMAL_7_2, Types.DecimalType.of(38, 10))); Expression expr = equal(INT_DECIMAL_7_2, new BigDecimal("1.00")); expected = selectRowRanges(INT_DECIMAL_7_2, 1, 4, 6, 7); assertRowRangesEquals(expected, calculateRowRanges(expr)); - expr = or(lessThan(INT_DECIMAL_7_2, new BigDecimal("1.00")), - greaterThan(INT_DECIMAL_7_2, new BigDecimal("1.01"))); + expr = + or( + lessThan(INT_DECIMAL_7_2, new BigDecimal("1.00")), + greaterThan(INT_DECIMAL_7_2, new BigDecimal("1.01"))); expected = selectRowRanges(INT_DECIMAL_7_2, 0, 3, 4, 7); assertRowRangesEquals(expected, calculateRowRanges(expr)); @@ -841,54 +881,62 @@ public void testBinaryBackedDecimal() { String binaryDecimal = "decimal_38_10"; long rowCount = 9; - ColumnIndex binaryDecimalCI = new CIBuilder(optional(FIXED_LEN_BYTE_ARRAY) - .length(TypeUtil.decimalRequiredBytes(38)) - .named(binaryDecimal), ASCENDING) + ColumnIndex binaryDecimalCI = + new CIBuilder( + optional(FIXED_LEN_BYTE_ARRAY) + .length(TypeUtil.decimalRequiredBytes(38)) + .named(binaryDecimal), + ASCENDING) .addPage(0, decimalToBytes("12.34"), decimalToBytes("12.35")) .addPage(0, decimalToBytes("123456789.87654321"), decimalToBytes("123456789.87654323")) .build(); - OffsetIndex binaryDecimalOI = new OIBuilder() - .addPage(5) - .addPage(4) - .build(); - - ColumnIndexStore columnIndexStore = new ColumnIndexStore() { - @Override - public ColumnIndex getColumnIndex(ColumnPath columnPath) { - switch (columnPath.toDotString()) { - case "decimal_38_10": - return binaryDecimalCI; - default: - return null; - } - } - - @Override - public OffsetIndex getOffsetIndex(ColumnPath columnPath) { - switch (columnPath.toDotString()) { - case "decimal_38_10": - return binaryDecimalOI; - default: - throw new 
MissingOffsetIndexException(columnPath); - } - } - }; - - MessageType messageType = org.apache.parquet.schema.Types.buildMessage() - .addField(optional(FIXED_LEN_BYTE_ARRAY).length(TypeUtil.decimalRequiredBytes(38)).id(1).as(LogicalTypeAnnotation.decimalType(10, 38)).named(binaryDecimal)) - .named("decimal"); - - Schema schema = new Schema( - Types.NestedField.optional(1, binaryDecimal, Types.DecimalType.of(38, 10))); - - Expression expr = or( + OffsetIndex binaryDecimalOI = new OIBuilder().addPage(5).addPage(4).build(); + + ColumnIndexStore columnIndexStore = + new ColumnIndexStore() { + @Override + public ColumnIndex getColumnIndex(ColumnPath columnPath) { + switch (columnPath.toDotString()) { + case "decimal_38_10": + return binaryDecimalCI; + default: + return null; + } + } + + @Override + public OffsetIndex getOffsetIndex(ColumnPath columnPath) { + switch (columnPath.toDotString()) { + case "decimal_38_10": + return binaryDecimalOI; + default: + throw new MissingOffsetIndexException(columnPath); + } + } + }; + + MessageType messageType = + org.apache.parquet.schema.Types.buildMessage() + .addField( + optional(FIXED_LEN_BYTE_ARRAY) + .length(TypeUtil.decimalRequiredBytes(38)) + .id(1) + .as(LogicalTypeAnnotation.decimalType(10, 38)) + .named(binaryDecimal)) + .named("decimal"); + + Schema schema = + new Schema(Types.NestedField.optional(1, binaryDecimal, Types.DecimalType.of(38, 10))); + + Expression expr = + or( lessThan(binaryDecimal, new BigDecimal("12.34")), - greaterThanOrEqual(binaryDecimal, new BigDecimal("123456789.87654322")) - ); + greaterThanOrEqual(binaryDecimal, new BigDecimal("123456789.87654322"))); RowRanges expected = selectRowRanges(binaryDecimal, columnIndexStore, rowCount, 1); - RowRanges actual = calculateRowRanges(schema, messageType, expr, true, columnIndexStore, rowCount); + RowRanges actual = + calculateRowRanges(schema, messageType, expr, true, columnIndexStore, rowCount); assertRowRangesEquals(expected, actual); expr = 
greaterThan(binaryDecimal, new BigDecimal("123456789.87654323")); From 22cab3085d3f62937d0cace0f91de2703522b4e2 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Wed, 1 Mar 2023 13:32:34 +0800 Subject: [PATCH 07/21] Support page skipping on row read path. --- .../parquet/VectorizedColumnIterator.java | 3 +- .../iceberg/parquet/BaseColumnIterator.java | 33 +- .../iceberg/parquet/BasePageIterator.java | 4 + .../iceberg/parquet/ColumnIterator.java | 59 +++- .../apache/iceberg/parquet/PageIterator.java | 7 +- .../org/apache/iceberg/parquet/Parquet.java | 9 +- .../apache/iceberg/parquet/ParquetReader.java | 22 +- .../iceberg/parquet/ParquetValueReader.java | 4 +- .../iceberg/parquet/ParquetValueReaders.java | 51 ++- .../org/apache/iceberg/parquet/ReadConf.java | 35 +- .../parquet/VectorizedParquetReader.java | 3 +- .../data/TestSparkParquetPageSkipping.java | 332 ++++++++++++++++++ 12 files changed, 525 insertions(+), 37 deletions(-) create mode 100644 spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java index 822ca8973f54..5e1e35df3fc9 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/parquet/VectorizedColumnIterator.java @@ -18,6 +18,7 @@ */ package org.apache.iceberg.arrow.vectorized.parquet; +import java.util.Optional; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.IntVector; import org.apache.iceberg.arrow.vectorized.NullabilityHolder; @@ -55,7 +56,7 @@ public Dictionary setRowGroupInfo(PageReader store, boolean allPagesDictEncoded) // setPageSource can result in a data page read. 
If that happens, we need // to know in advance whether all the pages in the row group are dictionary encoded or not this.vectorizedPageIterator.setAllPagesDictEncoded(allPagesDictEncoded); - super.setPageSource(store); + super.setPageSource(store, Optional.empty()); return dictionary; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java index 647397fad670..337e515fc549 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java @@ -18,14 +18,18 @@ */ package org.apache.iceberg.parquet; +import java.util.Optional; +import java.util.PrimitiveIterator; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.page.DataPage; import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; @SuppressWarnings("checkstyle:VisibilityModifier") public abstract class BaseColumnIterator { protected final ColumnDescriptor desc; + protected final int definitionLevel; // state reset for each row group protected PageReader pageSource = null; @@ -34,24 +38,43 @@ public abstract class BaseColumnIterator { protected long advanceNextPageCount = 0L; protected Dictionary dictionary; + // state for page skipping + protected boolean synchronizing = false; + protected PrimitiveIterator.OfLong rowIndexes; + protected long targetRowIndex; + protected long currentRowIndex; + protected int skipValues; + protected BaseColumnIterator(ColumnDescriptor descriptor) { this.desc = descriptor; + this.definitionLevel = desc.getMaxDefinitionLevel() - 1; } - public void setPageSource(PageReader source) { + public void setPageSource(PageReader source, Optional rowRanges) { this.pageSource = source; this.triplesCount = source.getTotalValueCount(); this.triplesRead = 0L; 
this.advanceNextPageCount = 0L; + if (rowRanges.isPresent()) { + this.synchronizing = true; + this.rowIndexes = rowRanges.get().iterator(); + this.targetRowIndex = Long.MIN_VALUE; + } + BasePageIterator pageIterator = pageIterator(); pageIterator.reset(); dictionary = ParquetUtil.readDictionary(desc, pageSource); pageIterator.setDictionary(dictionary); advance(); + skip(); } protected abstract BasePageIterator pageIterator(); + protected void skip() { + throw new UnsupportedOperationException(); + } + protected void advance() { if (triplesRead >= advanceNextPageCount) { BasePageIterator pageIterator = pageIterator(); @@ -60,6 +83,14 @@ protected void advance() { if (page != null) { pageIterator.setPage(page); this.advanceNextPageCount += pageIterator.currentPageCount(); + + if (synchronizing) { + long firstRowIndex = page.getFirstRowIndex() + .orElseThrow(() -> + new IllegalArgumentException("Missing page first row index for synchronizing values")); + this.skipValues = 0; + this.currentRowIndex = firstRowIndex - 1; + } } else { return; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/BasePageIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/BasePageIterator.java index 75989e8f649b..b948458fd6a3 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/BasePageIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/BasePageIterator.java @@ -71,6 +71,10 @@ protected void reset() { this.hasNext = false; } + protected void skip(int skipValues) { + throw new UnsupportedOperationException(); + } + protected abstract void initDataReader( Encoding dataEncoding, ByteBufferInputStream in, int valueCount); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java index 1c0ea4829eb8..f6fa64ec0d3c 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java +++ 
b/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java @@ -111,53 +111,98 @@ public int currentRepetitionLevel() { public boolean nextBoolean() { this.triplesRead += 1; advance(); - return pageIterator.nextBoolean(); + boolean value = pageIterator.nextBoolean(); + skip(); + return value; } @Override public int nextInteger() { this.triplesRead += 1; advance(); - return pageIterator.nextInteger(); + int value = pageIterator.nextInteger(); + skip(); + return value; } @Override public long nextLong() { this.triplesRead += 1; advance(); - return pageIterator.nextLong(); + long value = pageIterator.nextLong(); + skip(); + return value; } @Override public float nextFloat() { this.triplesRead += 1; advance(); - return pageIterator.nextFloat(); + float value = pageIterator.nextFloat(); + skip(); + return value; } @Override public double nextDouble() { this.triplesRead += 1; advance(); - return pageIterator.nextDouble(); + double value = pageIterator.nextDouble(); + skip(); + return value; } @Override public Binary nextBinary() { this.triplesRead += 1; advance(); - return pageIterator.nextBinary(); + Binary value = pageIterator.nextBinary(); + skip(); + return value; } @Override public N nextNull() { this.triplesRead += 1; advance(); - return pageIterator.nextNull(); + N value = pageIterator.nextNull(); + skip(); + return value; } @Override protected BasePageIterator pageIterator() { return pageIterator; } + + @Override + protected void skip() { + if (!synchronizing) { + return; + } + + skipValues = 0; + while (hasNext()) { + advance(); + if (pageIterator.currentRepetitionLevel() == 0) { + currentRowIndex += 1; + if (currentRowIndex > targetRowIndex) { + targetRowIndex = rowIndexes.hasNext() ? 
rowIndexes.nextLong() : Long.MAX_VALUE; + } + } + + if (currentRowIndex < targetRowIndex) { + triplesRead += 1; + if (pageIterator.currentDefinitionLevel() > definitionLevel) { + skipValues += 1; + } + + pageIterator.advance(); + } else { + break; + } + } + + pageIterator.skip(skipValues); + } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java index 34383352bf68..a79445d25607 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/PageIterator.java @@ -189,7 +189,12 @@ public V nextNull() { return null; } - private void advance() { + @Override + protected void skip(int skipValues) { + values.skip(skipValues); + } + + protected void advance() { if (triplesRead < triplesCount) { this.currentDL = definitionLevels.nextInt(); this.currentRL = repetitionLevels.nextInt(); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index d240c84b9e4d..877a3d7e4923 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -978,6 +978,7 @@ public static class ReadBuilder { private NameMapping nameMapping = null; private ByteBuffer fileEncryptionKey = null; private ByteBuffer fileAADPrefix = null; + private boolean useColumnIndexFilter = false; private ReadBuilder(InputFile file) { this.file = file; @@ -1020,6 +1021,11 @@ public ReadBuilder filter(Expression newFilter) { return this; } + public ReadBuilder useColumnIndexFilter(boolean newUseColumnIndexFilter) { + this.useColumnIndexFilter = newUseColumnIndexFilter; + return this; + } + public ReadBuilder readSupport(ReadSupport newFilterSupport) { this.readSupport = newFilterSupport; return this; @@ -1139,7 +1145,8 @@ public CloseableIterable build() { nameMapping, filter, 
reuseContainers, - caseSensitive); + caseSensitive, + useColumnIndexFilter); } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java index c1d8b0ccbbad..b2e563f547c5 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java @@ -19,6 +19,7 @@ package org.apache.iceberg.parquet; import java.io.IOException; +import java.util.Optional; import java.util.function.Function; import org.apache.iceberg.Schema; import org.apache.iceberg.exceptions.RuntimeIOException; @@ -32,6 +33,7 @@ import org.apache.parquet.ParquetReadOptions; import org.apache.parquet.column.page.PageReadStore; import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.schema.MessageType; public class ParquetReader extends CloseableGroup implements CloseableIterable { @@ -43,6 +45,7 @@ public class ParquetReader extends CloseableGroup implements CloseableIterabl private final boolean reuseContainers; private final boolean caseSensitive; private final NameMapping nameMapping; + private final boolean useColumnIndexFilter; public ParquetReader( InputFile input, @@ -52,7 +55,8 @@ public ParquetReader( NameMapping nameMapping, Expression filter, boolean reuseContainers, - boolean caseSensitive) { + boolean caseSensitive, + boolean useColumnIndexFilter) { this.input = input; this.expectedSchema = expectedSchema; this.options = options; @@ -62,6 +66,7 @@ public ParquetReader( this.reuseContainers = reuseContainers; this.caseSensitive = caseSensitive; this.nameMapping = nameMapping; + this.useColumnIndexFilter = useColumnIndexFilter; } private ReadConf conf = null; @@ -79,7 +84,8 @@ private ReadConf init() { nameMapping, reuseContainers, caseSensitive, - null); + null, + useColumnIndexFilter); this.conf = readConf.copy(); return 
readConf; } @@ -100,6 +106,7 @@ private static class FileIterator implements CloseableIterator { private final long totalValues; private final boolean reuseContainers; private final long[] rowGroupsStartRowPos; + private final RowRanges[] rowRangesArr; private int nextRowGroup = 0; private long nextRowGroupStart = 0; @@ -113,6 +120,7 @@ private static class FileIterator implements CloseableIterator { this.totalValues = conf.totalValues(); this.reuseContainers = conf.reuseContainers(); this.rowGroupsStartRowPos = conf.startRowPositions(); + this.rowRangesArr = conf.rowRangesArr(); } @Override @@ -139,12 +147,16 @@ public T next() { private void advance() { while (shouldSkip[nextRowGroup]) { nextRowGroup += 1; - reader.skipNextRowGroup(); } PageReadStore pages; + Optional rowRanges = Optional.ofNullable(rowRangesArr[nextRowGroup]); try { - pages = reader.readNextRowGroup(); + if (rowRanges.isPresent()) { + pages = PageSkippingHelpers.internalReadFilteredRowGroup(reader, nextRowGroup, rowRanges.get()); + } else { + pages = reader.readRowGroup(nextRowGroup); + } } catch (IOException e) { throw new RuntimeIOException(e); } @@ -153,7 +165,7 @@ private void advance() { nextRowGroupStart += pages.getRowCount(); nextRowGroup += 1; - model.setPageSource(pages, rowPosition); + model.setPageSource(pages, rowPosition, rowRanges); } @Override diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java index b6c2b5b70303..adc802ba1c01 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java @@ -19,7 +19,9 @@ package org.apache.iceberg.parquet; import java.util.List; +import java.util.Optional; import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; public interface ParquetValueReader { T read(T reuse); @@ -28,5 
+30,5 @@ public interface ParquetValueReader { List> columns(); - void setPageSource(PageReadStore pageStore, long rowPosition); + void setPageSource(PageReadStore pageStore, long rowPosition, Optional RowRanges); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java index c1f76e7bdb9a..ee0f2e12189a 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java @@ -27,11 +27,14 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Optional; +import java.util.PrimitiveIterator; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.io.api.Binary; import org.apache.parquet.schema.Type; @@ -112,7 +115,7 @@ public List> columns() { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition) {} + public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) {} } static class ConstantReader implements ParquetValueReader { @@ -175,17 +178,17 @@ public List> columns() { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition) {} + public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) {} } static class PositionReader implements ParquetValueReader { private long rowOffset = -1; private long rowGroupStart; + private PrimitiveIterator.OfLong rowIndexes; @Override public Long read(Long reuse) { - rowOffset = rowOffset + 1; - return rowGroupStart + rowOffset; + return 
rowGroupStart + rowIndexes.nextLong(); } @Override @@ -199,9 +202,25 @@ public List> columns() { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition) { + public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { this.rowGroupStart = rowPosition; this.rowOffset = -1; + if (rowRanges.isPresent()) { + this.rowIndexes = rowRanges.get().iterator(); + } else { + this.rowIndexes = new PrimitiveIterator.OfLong() { + @Override + public long nextLong() { + rowOffset = rowOffset + 1; + return rowOffset; + } + + @Override + public boolean hasNext() { + return false; + } + }; + } } } @@ -220,8 +239,8 @@ protected PrimitiveReader(ColumnDescriptor desc) { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition) { - column.setPageSource(pageStore.getPageReader(desc)); + public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { + column.setPageSource(pageStore.getPageReader(desc), rowRanges); } @Override @@ -404,8 +423,8 @@ private static class OptionReader implements ParquetValueReader { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition) { - reader.setPageSource(pageStore, rowPosition); + public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { + reader.setPageSource(pageStore, rowPosition, rowRanges); } @Override @@ -449,8 +468,8 @@ protected RepeatedReader( } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition) { - reader.setPageSource(pageStore, rowPosition); + public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { + reader.setPageSource(pageStore, rowPosition, rowRanges); } @Override @@ -568,9 +587,9 @@ protected RepeatedKeyValueReader( } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition) { - keyReader.setPageSource(pageStore, rowPosition); - valueReader.setPageSource(pageStore, rowPosition); + 
public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { + keyReader.setPageSource(pageStore, rowPosition, rowRanges); + valueReader.setPageSource(pageStore, rowPosition, rowRanges); } @Override @@ -726,9 +745,9 @@ protected StructReader(List types, List> readers) { } @Override - public final void setPageSource(PageReadStore pageStore, long rowPosition) { + public final void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { for (ParquetValueReader reader : readers) { - reader.setPageSource(pageStore, rowPosition); + reader.setPageSource(pageStore, rowPosition, rowRanges); } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java index da91e4dfa56a..7d97877e3719 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java @@ -40,8 +40,12 @@ import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.hadoop.metadata.ColumnPath; +import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.schema.MessageType; +import static org.apache.iceberg.parquet.PageSkippingHelpers.getColumnIndexStore; + /** * Configuration for Parquet readers. 
* @@ -60,11 +64,12 @@ class ReadConf { private final boolean reuseContainers; private final Integer batchSize; private final long[] startRowPositions; + private final RowRanges[] rowRangesArr; // List of column chunk metadata for each row group private final List> columnChunkMetaDataForRowGroups; - @SuppressWarnings("unchecked") + @SuppressWarnings({"unchecked", "CyclomaticComplexity"}) ReadConf( InputFile file, ParquetReadOptions options, @@ -75,7 +80,8 @@ class ReadConf { NameMapping nameMapping, boolean reuseContainers, boolean caseSensitive, - Integer bSize) { + Integer bSize, + boolean useColumnIndexFilter) { this.file = file; this.options = options; this.reader = newReader(file, options); @@ -96,6 +102,7 @@ class ReadConf { this.rowGroups = reader.getRowGroups(); this.shouldSkip = new boolean[rowGroups.size()]; this.startRowPositions = new long[rowGroups.size()]; + this.rowRangesArr = new RowRanges[rowGroups.size()]; // Fetch all row groups starting positions to compute the row offsets of the filtered row groups Map offsetToStartPos = generateOffsetToStartPos(expectedSchema); @@ -103,10 +110,14 @@ class ReadConf { ParquetMetricsRowGroupFilter statsFilter = null; ParquetDictionaryRowGroupFilter dictFilter = null; ParquetBloomRowGroupFilter bloomFilter = null; + ParquetColumnIndexFilter columnIndexFilter = null; if (filter != null) { statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter, caseSensitive); dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, filter, caseSensitive); bloomFilter = new ParquetBloomRowGroupFilter(expectedSchema, filter, caseSensitive); + if (useColumnIndexFilter) { + columnIndexFilter = new ParquetColumnIndexFilter(expectedSchema, filter, caseSensitive); + } } long computedTotalValues = 0L; @@ -121,9 +132,22 @@ class ReadConf { typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)) && bloomFilter.shouldRead( typeWithIds, rowGroup, reader.getBloomFilterDataReader(rowGroup))); + + if 
(useColumnIndexFilter && filter != null && shouldRead) { + ColumnIndexStore columnIndexStore = getColumnIndexStore(reader, i); + RowRanges rowRanges = + columnIndexFilter.calculateRowRanges(typeWithIds, columnIndexStore, rowGroup.getRowCount()); + + if (rowRanges.getRanges().size() == 0) { + shouldRead = false; + } else if (rowRanges.rowCount() != rowGroup.getRowCount()) { + rowRangesArr[i] = rowRanges; + } + } + this.shouldSkip[i] = !shouldRead; if (shouldRead) { - computedTotalValues += rowGroup.getRowCount(); + computedTotalValues += rowRangesArr[i] == null ? rowGroup.getRowCount() : rowRangesArr[i].rowCount(); } } @@ -156,6 +180,7 @@ private ReadConf(ReadConf toCopy) { this.vectorizedModel = toCopy.vectorizedModel; this.columnChunkMetaDataForRowGroups = toCopy.columnChunkMetaDataForRowGroups; this.startRowPositions = toCopy.startRowPositions; + this.rowRangesArr = toCopy.rowRangesArr; } ParquetFileReader reader() { @@ -181,6 +206,10 @@ boolean[] shouldSkip() { return shouldSkip; } + RowRanges[] rowRangesArr() { + return rowRangesArr; + } + private Map generateOffsetToStartPos(Schema schema) { if (schema.findField(MetadataColumns.ROW_POSITION.fieldId()) == null) { return null; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java index 773e0f7a85d0..e2a61477c682 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java @@ -87,7 +87,8 @@ private ReadConf init() { nameMapping, reuseContainers, caseSensitive, - batchSize); + batchSize, + false); this.conf = readConf.copy(); return readConf; } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java new file mode 100644 index 
000000000000..a00c76ac35d3 --- /dev/null +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java @@ -0,0 +1,332 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.spark.data; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import org.apache.avro.generic.GenericData; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.common.DynFields; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.parquet.Parquet; +import org.apache.iceberg.parquet.ParquetAvroWriter; +import org.apache.iceberg.relocated.com.google.common.base.Function; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import 
org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.Pair; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.junit.Assert; +import org.junit.Assume; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + +@RunWith(Parameterized.class) +public class TestSparkParquetPageSkipping { + + private static final Types.StructType PRIMITIVES = Types.StructType.of( + required(0, "_long", Types.LongType.get()), + optional(1, "_string", Types.StringType.get()), // var width + required(2, "_bool", Types.BooleanType.get()), + optional(3, "_int", Types.IntegerType.get()), + optional(4, "_float", Types.FloatType.get()), + required(5, "_double", Types.DoubleType.get()), + optional(6, "_date", Types.DateType.get()), + required(7, "_ts", Types.TimestampType.withZone()), + required(8, "_fixed", Types.FixedType.ofLength(7)), + optional(9, "_bytes", Types.BinaryType.get()), // var width + required(10, "_dec_9_0", Types.DecimalType.of(9, 0)), // int + required(11, "_dec_11_2", Types.DecimalType.of(11, 2)), // long + required(12, "_dec_38_10", Types.DecimalType.of(38, 10)) // fixed + ); + + 
private static final Schema PRIMITIVES_SCHEMA = new Schema(PRIMITIVES.fields()); + + private static final Types.StructType LIST = Types.StructType.of( + optional(13, "_list", Types.ListType.ofOptional(14, Types.StringType.get()))); + private static final Types.StructType MAP = Types.StructType.of( + optional(15, "_map", Types.MapType.ofOptional(16, 17, + Types.StringType.get(), + Types.StringType.get()))); + private static final Schema COMPLEX_SCHEMA = new Schema( + Lists.newArrayList( + Iterables.concat(PRIMITIVES.fields(), LIST.fields(), MAP.fields()))); + + @Rule + public TemporaryFolder temp = new TemporaryFolder(); + + private File testFile; + private List allRecords = Lists.newArrayList(); + +/* Column and offset indexes info of `_long` column in `testFile` printed by parquet-cli's column-index command: + row-group 0: + column index for column _long: + Boudary order: ASCENDING + null count min max + page-0 0 0 56 + page-1 0 57 113 + page-2 0 114 170 + page-3 0 171 227 + page-4 0 228 284 + page-5 0 285 341 + page-6 0 342 398 + page-7 0 399 455 + page-8 0 456 512 + page-9 0 513 569 + page-10 0 570 592 + + offset index for column _long: + offset compressed size first row index + page-0 4 137 0 + page-1 141 138 57 + page-2 279 137 114 + page-3 416 138 171 + page-4 554 137 228 + page-5 691 141 285 + page-6 832 140 342 + page-7 972 141 399 + page-8 1113 141 456 + page-9 1254 140 513 + page-10 1394 92 570 + + + row-group 1: + column index for column _long: + Boudary order: ASCENDING + null count min max + page-0 0 593 649 + page-1 0 650 706 + page-2 0 707 763 + page-3 0 764 820 + page-4 0 821 877 + page-5 0 878 934 + page-6 0 935 991 + page-7 0 992 999 + + offset index for column _long: + offset compressed size first row index + page-0 498681 140 0 + page-1 498821 140 57 + page-2 498961 141 114 + page-3 499102 141 171 + page-4 499243 141 228 + page-5 499384 140 285 + page-6 499524 142 342 + page-7 499666 68 399 + */ + + private long index = -1; + private final 
static int ABOVE_INT_COL_MAX_VALUE = Integer.MAX_VALUE; + + @Before + public void generateFile() throws IOException { + testFile = temp.newFile(); + Assert.assertTrue("Delete should succeed", testFile.delete()); + + Function transform = record -> { + index += 1; + if (record.get("_long") != null) { + record.put("_long", index); + } + + if (Objects.equals(record.get("_int"), ABOVE_INT_COL_MAX_VALUE)) { + record.put("_int", ABOVE_INT_COL_MAX_VALUE - 1); + } + + return record; + }; + + int numRecords = 1000; + allRecords = RandomData.generateList(COMPLEX_SCHEMA, numRecords, 0) + .stream() + .map(transform) + .collect(Collectors.toList()); + + try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) + .createWriterFunc(ParquetAvroWriter::buildWriter) + .schema(COMPLEX_SCHEMA) + .set(PARQUET_PAGE_SIZE_BYTES, "500") + .set(PARQUET_ROW_GROUP_SIZE_BYTES, "500000") // 2 row groups + .set(PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, "1") + .set(PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, "1") + .set(PARQUET_DICT_SIZE_BYTES, "1") + .named("pages_unaligned_file") + .build()) { + writer.addAll(allRecords); + } + } + + @Parameterized.Parameters(name = "vectorized = {0}") + public static Object[] parameters() { + return new Object[] { false }; + } + + private final boolean vectorized; + + public TestSparkParquetPageSkipping(boolean vectorized) { + this.vectorized = vectorized; + } + + @Test + public void testSinglePageMatch() { + Expression filter = Expressions.and( + Expressions.greaterThanOrEqual("_long", 57), + Expressions.lessThan("_long", 114)); // exactly page-1 -> row ranges: [57, 113] + + List expected = selectRecords(allRecords, Pair.of(57, 114)); + readAndValidate(filter, expected); + } + + @Test + public void testMultiplePagesMatch() { + Expression filter = Expressions.or( + // page-1 -> row ranges: [57, 113] + Expressions.and( + Expressions.greaterThanOrEqual("_long", 57), + Expressions.lessThan("_long", 114)), + + // page-3, page-4 in row group 0 -> row 
ranges[171, 284] + Expressions.and( + Expressions.greaterThanOrEqual("_long", 173), + Expressions.lessThan("_long", 260)) + ); + + List expected = selectRecords(allRecords, Pair.of(57, 114), Pair.of(171, 285)); + readAndValidate(filter, expected); + } + + @Test + public void testMultipleRowGroupsMatch() { + Expression filter = Expressions.or( + // page-1 -> row ranges: [57, 113] + Expressions.and( + Expressions.greaterThanOrEqual("_long", 57), + Expressions.lessThan("_long", 114)), + + // page-3, page-4 in row group 0 -> row ranges[171, 284] + Expressions.and( + Expressions.greaterThanOrEqual("_long", 173), + Expressions.lessThan("_long", 260)) + ); + + filter = Expressions.or( + filter, + // page-10 in row group 0 and page-0, page-1 in row group 1 -> row ranges: [570, 706] + Expressions.and( + Expressions.greaterThanOrEqual("_long", 572), + Expressions.lessThan("_long", 663)) + ); + + List expected = selectRecords(allRecords, + Pair.of(57, 114), Pair.of(171, 285), Pair.of(570, 707)); + readAndValidate(filter, expected); + } + + @Test + public void testNoRowsMatch() { + Expression filter = Expressions.and( + Expressions.and( + Expressions.greaterThan("_long", 40), + Expressions.lessThan("_long", 46)), + Expressions.equal("_int", ABOVE_INT_COL_MAX_VALUE)); + + readAndValidate(filter, ImmutableList.of()); + } + + @Test + public void testAllRowsMatch() { + Expression filter = Expressions.greaterThanOrEqual("_long", Long.MIN_VALUE); + readAndValidate(filter, allRecords); + } + + private Schema readSchema() { + return vectorized ? 
PRIMITIVES_SCHEMA : COMPLEX_SCHEMA; + } + + private void readAndValidate(Expression filter, List expected) { + Schema projected = readSchema(); + + Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)) + .project(projected) + .filter(filter) + .useColumnIndexFilter(true); + + Types.StructType struct = projected.asStruct(); + + if (vectorized) { + CloseableIterable batches = builder.createBatchedReaderFunc( + type -> + VectorizedSparkParquetReaders.buildReader(projected, type, true)) + .build(); + + Iterator expectedIterator = expected.iterator(); + for (ColumnarBatch batch : batches) { + TestHelpers.assertEqualsBatch(struct, expectedIterator, batch, true); + } + + Assert.assertFalse("The expected records is more than the actual result", expectedIterator.hasNext()); + } else { + CloseableIterable reader = builder.createReaderFunc( + type -> SparkParquetReaders.buildReader(projected, type)) + .build(); + CloseableIterator actualRows = reader.iterator(); + + for (GenericData.Record record : expected) { + Assert.assertTrue("Should have expected number of rows", actualRows.hasNext()); + TestHelpers.assertEqualsUnsafe(struct, record, actualRows.next()); + } + + Assert.assertFalse("Should not have extra rows", actualRows.hasNext()); + } + } + + private List selectRecords(List records, Pair... ranges) { + return Arrays.stream(ranges) + .map(range -> records.subList(range.first(), range.second())) + .flatMap(Collection::stream) + .collect(Collectors.toList()); + } +} From f71a4a5cc71d2173de4439ce3ab4f010f2318c8c Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Wed, 1 Mar 2023 13:36:17 +0800 Subject: [PATCH 08/21] Spotless. 
--- .../iceberg/parquet/BaseColumnIterator.java | 9 +- .../apache/iceberg/parquet/ParquetReader.java | 4 +- .../iceberg/parquet/ParquetValueReaders.java | 49 ++-- .../org/apache/iceberg/parquet/ReadConf.java | 10 +- .../data/TestSparkParquetPageSkipping.java | 246 +++++++++--------- 5 files changed, 169 insertions(+), 149 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java index 337e515fc549..88531519e977 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java @@ -85,9 +85,12 @@ protected void advance() { this.advanceNextPageCount += pageIterator.currentPageCount(); if (synchronizing) { - long firstRowIndex = page.getFirstRowIndex() - .orElseThrow(() -> - new IllegalArgumentException("Missing page first row index for synchronizing values")); + long firstRowIndex = + page.getFirstRowIndex() + .orElseThrow( + () -> + new IllegalArgumentException( + "Missing page first row index for synchronizing values")); this.skipValues = 0; this.currentRowIndex = firstRowIndex - 1; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java index b2e563f547c5..bb9fd6fb423b 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java @@ -153,7 +153,9 @@ private void advance() { Optional rowRanges = Optional.ofNullable(rowRangesArr[nextRowGroup]); try { if (rowRanges.isPresent()) { - pages = PageSkippingHelpers.internalReadFilteredRowGroup(reader, nextRowGroup, rowRanges.get()); + pages = + PageSkippingHelpers.internalReadFilteredRowGroup( + reader, nextRowGroup, rowRanges.get()); } else { pages = reader.readRowGroup(nextRowGroup); } diff --git 
a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java index ee0f2e12189a..7bb5d8537436 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java @@ -115,7 +115,8 @@ public List> columns() { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) {} + public void setPageSource( + PageReadStore pageStore, long rowPosition, Optional rowRanges) {} } static class ConstantReader implements ParquetValueReader { @@ -178,7 +179,8 @@ public List> columns() { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) {} + public void setPageSource( + PageReadStore pageStore, long rowPosition, Optional rowRanges) {} } static class PositionReader implements ParquetValueReader { @@ -202,24 +204,26 @@ public List> columns() { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { + public void setPageSource( + PageReadStore pageStore, long rowPosition, Optional rowRanges) { this.rowGroupStart = rowPosition; this.rowOffset = -1; if (rowRanges.isPresent()) { this.rowIndexes = rowRanges.get().iterator(); } else { - this.rowIndexes = new PrimitiveIterator.OfLong() { - @Override - public long nextLong() { - rowOffset = rowOffset + 1; - return rowOffset; - } - - @Override - public boolean hasNext() { - return false; - } - }; + this.rowIndexes = + new PrimitiveIterator.OfLong() { + @Override + public long nextLong() { + rowOffset = rowOffset + 1; + return rowOffset; + } + + @Override + public boolean hasNext() { + return false; + } + }; } } } @@ -239,7 +243,8 @@ protected PrimitiveReader(ColumnDescriptor desc) { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { + public void setPageSource( + 
PageReadStore pageStore, long rowPosition, Optional rowRanges) { column.setPageSource(pageStore.getPageReader(desc), rowRanges); } @@ -423,7 +428,8 @@ private static class OptionReader implements ParquetValueReader { } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { + public void setPageSource( + PageReadStore pageStore, long rowPosition, Optional rowRanges) { reader.setPageSource(pageStore, rowPosition, rowRanges); } @@ -468,7 +474,8 @@ protected RepeatedReader( } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { + public void setPageSource( + PageReadStore pageStore, long rowPosition, Optional rowRanges) { reader.setPageSource(pageStore, rowPosition, rowRanges); } @@ -587,7 +594,8 @@ protected RepeatedKeyValueReader( } @Override - public void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { + public void setPageSource( + PageReadStore pageStore, long rowPosition, Optional rowRanges) { keyReader.setPageSource(pageStore, rowPosition, rowRanges); valueReader.setPageSource(pageStore, rowPosition, rowRanges); } @@ -745,7 +753,8 @@ protected StructReader(List types, List> readers) { } @Override - public final void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges) { + public final void setPageSource( + PageReadStore pageStore, long rowPosition, Optional rowRanges) { for (ParquetValueReader reader : readers) { reader.setPageSource(pageStore, rowPosition, rowRanges); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java index 7d97877e3719..aaaba05b28df 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java @@ -18,6 +18,8 @@ */ package org.apache.iceberg.parquet; +import static 
org.apache.iceberg.parquet.PageSkippingHelpers.getColumnIndexStore; + import java.io.IOException; import java.io.UncheckedIOException; import java.util.List; @@ -44,8 +46,6 @@ import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.schema.MessageType; -import static org.apache.iceberg.parquet.PageSkippingHelpers.getColumnIndexStore; - /** * Configuration for Parquet readers. * @@ -136,7 +136,8 @@ class ReadConf { if (useColumnIndexFilter && filter != null && shouldRead) { ColumnIndexStore columnIndexStore = getColumnIndexStore(reader, i); RowRanges rowRanges = - columnIndexFilter.calculateRowRanges(typeWithIds, columnIndexStore, rowGroup.getRowCount()); + columnIndexFilter.calculateRowRanges( + typeWithIds, columnIndexStore, rowGroup.getRowCount()); if (rowRanges.getRanges().size() == 0) { shouldRead = false; @@ -147,7 +148,8 @@ class ReadConf { this.shouldSkip[i] = !shouldRead; if (shouldRead) { - computedTotalValues += rowRangesArr[i] == null ? rowGroup.getRowCount() : rowRangesArr[i].rowCount(); + computedTotalValues += + rowRangesArr[i] == null ? rowGroup.getRowCount() : rowRangesArr[i].rowCount(); } } diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java index a00c76ac35d3..cfdc33c9552c 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java @@ -16,9 +16,16 @@ * specific language governing permissions and limitations * under the License. 
*/ - package org.apache.iceberg.spark.data; +import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.apache.iceberg.types.Types.NestedField.required; + import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -30,7 +37,6 @@ import org.apache.avro.generic.GenericData; import org.apache.iceberg.Files; import org.apache.iceberg.Schema; -import org.apache.iceberg.common.DynFields; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.io.CloseableIterable; @@ -48,7 +54,6 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.vectorized.ColumnarBatch; import org.junit.Assert; -import org.junit.Assume; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -56,52 +61,47 @@ import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import static org.apache.iceberg.TableProperties.PARQUET_DICT_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; -import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; -import static org.apache.iceberg.types.Types.NestedField.optional; -import static org.apache.iceberg.types.Types.NestedField.required; - @RunWith(Parameterized.class) public class TestSparkParquetPageSkipping { - private static final Types.StructType 
PRIMITIVES = Types.StructType.of( - required(0, "_long", Types.LongType.get()), - optional(1, "_string", Types.StringType.get()), // var width - required(2, "_bool", Types.BooleanType.get()), - optional(3, "_int", Types.IntegerType.get()), - optional(4, "_float", Types.FloatType.get()), - required(5, "_double", Types.DoubleType.get()), - optional(6, "_date", Types.DateType.get()), - required(7, "_ts", Types.TimestampType.withZone()), - required(8, "_fixed", Types.FixedType.ofLength(7)), - optional(9, "_bytes", Types.BinaryType.get()), // var width - required(10, "_dec_9_0", Types.DecimalType.of(9, 0)), // int - required(11, "_dec_11_2", Types.DecimalType.of(11, 2)), // long - required(12, "_dec_38_10", Types.DecimalType.of(38, 10)) // fixed - ); + private static final Types.StructType PRIMITIVES = + Types.StructType.of( + required(0, "_long", Types.LongType.get()), + optional(1, "_string", Types.StringType.get()), // var width + required(2, "_bool", Types.BooleanType.get()), + optional(3, "_int", Types.IntegerType.get()), + optional(4, "_float", Types.FloatType.get()), + required(5, "_double", Types.DoubleType.get()), + optional(6, "_date", Types.DateType.get()), + required(7, "_ts", Types.TimestampType.withZone()), + required(8, "_fixed", Types.FixedType.ofLength(7)), + optional(9, "_bytes", Types.BinaryType.get()), // var width + required(10, "_dec_9_0", Types.DecimalType.of(9, 0)), // int + required(11, "_dec_11_2", Types.DecimalType.of(11, 2)), // long + required(12, "_dec_38_10", Types.DecimalType.of(38, 10)) // fixed + ); private static final Schema PRIMITIVES_SCHEMA = new Schema(PRIMITIVES.fields()); - private static final Types.StructType LIST = Types.StructType.of( - optional(13, "_list", Types.ListType.ofOptional(14, Types.StringType.get()))); - private static final Types.StructType MAP = Types.StructType.of( - optional(15, "_map", Types.MapType.ofOptional(16, 17, - Types.StringType.get(), - Types.StringType.get()))); - private static final Schema 
COMPLEX_SCHEMA = new Schema( - Lists.newArrayList( - Iterables.concat(PRIMITIVES.fields(), LIST.fields(), MAP.fields()))); - - @Rule - public TemporaryFolder temp = new TemporaryFolder(); + private static final Types.StructType LIST = + Types.StructType.of( + optional(13, "_list", Types.ListType.ofOptional(14, Types.StringType.get()))); + private static final Types.StructType MAP = + Types.StructType.of( + optional( + 15, + "_map", + Types.MapType.ofOptional(16, 17, Types.StringType.get(), Types.StringType.get()))); + private static final Schema COMPLEX_SCHEMA = + new Schema( + Lists.newArrayList(Iterables.concat(PRIMITIVES.fields(), LIST.fields(), MAP.fields()))); + + @Rule public TemporaryFolder temp = new TemporaryFolder(); private File testFile; private List allRecords = Lists.newArrayList(); -/* Column and offset indexes info of `_long` column in `testFile` printed by parquet-cli's column-index command: + /* Column and offset indexes info of `_long` column in `testFile` printed by parquet-cli's column-index command: row-group 0: column index for column _long: Boudary order: ASCENDING @@ -159,49 +159,51 @@ public class TestSparkParquetPageSkipping { */ private long index = -1; - private final static int ABOVE_INT_COL_MAX_VALUE = Integer.MAX_VALUE; + private static final int ABOVE_INT_COL_MAX_VALUE = Integer.MAX_VALUE; @Before public void generateFile() throws IOException { testFile = temp.newFile(); Assert.assertTrue("Delete should succeed", testFile.delete()); - Function transform = record -> { - index += 1; - if (record.get("_long") != null) { - record.put("_long", index); - } + Function transform = + record -> { + index += 1; + if (record.get("_long") != null) { + record.put("_long", index); + } - if (Objects.equals(record.get("_int"), ABOVE_INT_COL_MAX_VALUE)) { - record.put("_int", ABOVE_INT_COL_MAX_VALUE - 1); - } + if (Objects.equals(record.get("_int"), ABOVE_INT_COL_MAX_VALUE)) { + record.put("_int", ABOVE_INT_COL_MAX_VALUE - 1); + } - return record; - 
}; + return record; + }; int numRecords = 1000; - allRecords = RandomData.generateList(COMPLEX_SCHEMA, numRecords, 0) - .stream() - .map(transform) - .collect(Collectors.toList()); - - try (FileAppender writer = Parquet.write(Files.localOutput(testFile)) - .createWriterFunc(ParquetAvroWriter::buildWriter) - .schema(COMPLEX_SCHEMA) - .set(PARQUET_PAGE_SIZE_BYTES, "500") - .set(PARQUET_ROW_GROUP_SIZE_BYTES, "500000") // 2 row groups - .set(PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, "1") - .set(PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, "1") - .set(PARQUET_DICT_SIZE_BYTES, "1") - .named("pages_unaligned_file") - .build()) { + allRecords = + RandomData.generateList(COMPLEX_SCHEMA, numRecords, 0).stream() + .map(transform) + .collect(Collectors.toList()); + + try (FileAppender writer = + Parquet.write(Files.localOutput(testFile)) + .createWriterFunc(ParquetAvroWriter::buildWriter) + .schema(COMPLEX_SCHEMA) + .set(PARQUET_PAGE_SIZE_BYTES, "500") + .set(PARQUET_ROW_GROUP_SIZE_BYTES, "500000") // 2 row groups + .set(PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT, "1") + .set(PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT, "1") + .set(PARQUET_DICT_SIZE_BYTES, "1") + .named("pages_unaligned_file") + .build()) { writer.addAll(allRecords); } } @Parameterized.Parameters(name = "vectorized = {0}") public static Object[] parameters() { - return new Object[] { false }; + return new Object[] {false}; } private final boolean vectorized; @@ -212,9 +214,10 @@ public TestSparkParquetPageSkipping(boolean vectorized) { @Test public void testSinglePageMatch() { - Expression filter = Expressions.and( - Expressions.greaterThanOrEqual("_long", 57), - Expressions.lessThan("_long", 114)); // exactly page-1 -> row ranges: [57, 113] + Expression filter = + Expressions.and( + Expressions.greaterThanOrEqual("_long", 57), + Expressions.lessThan("_long", 114)); // exactly page-1 -> row ranges: [57, 113] List expected = selectRecords(allRecords, Pair.of(57, 114)); readAndValidate(filter, expected); @@ -222,56 
+225,52 @@ public void testSinglePageMatch() { @Test public void testMultiplePagesMatch() { - Expression filter = Expressions.or( - // page-1 -> row ranges: [57, 113] - Expressions.and( - Expressions.greaterThanOrEqual("_long", 57), - Expressions.lessThan("_long", 114)), - - // page-3, page-4 in row group 0 -> row ranges[171, 284] - Expressions.and( - Expressions.greaterThanOrEqual("_long", 173), - Expressions.lessThan("_long", 260)) - ); - - List expected = selectRecords(allRecords, Pair.of(57, 114), Pair.of(171, 285)); + Expression filter = + Expressions.or( + // page-1 -> row ranges: [57, 113] + Expressions.and( + Expressions.greaterThanOrEqual("_long", 57), Expressions.lessThan("_long", 114)), + + // page-3, page-4 in row group 0 -> row ranges[171, 284] + Expressions.and( + Expressions.greaterThanOrEqual("_long", 173), Expressions.lessThan("_long", 260))); + + List expected = + selectRecords(allRecords, Pair.of(57, 114), Pair.of(171, 285)); readAndValidate(filter, expected); } @Test public void testMultipleRowGroupsMatch() { - Expression filter = Expressions.or( - // page-1 -> row ranges: [57, 113] - Expressions.and( - Expressions.greaterThanOrEqual("_long", 57), - Expressions.lessThan("_long", 114)), - - // page-3, page-4 in row group 0 -> row ranges[171, 284] - Expressions.and( - Expressions.greaterThanOrEqual("_long", 173), - Expressions.lessThan("_long", 260)) - ); - - filter = Expressions.or( - filter, - // page-10 in row group 0 and page-0, page-1 in row group 1 -> row ranges: [570, 706] - Expressions.and( - Expressions.greaterThanOrEqual("_long", 572), - Expressions.lessThan("_long", 663)) - ); - - List expected = selectRecords(allRecords, - Pair.of(57, 114), Pair.of(171, 285), Pair.of(570, 707)); + Expression filter = + Expressions.or( + // page-1 -> row ranges: [57, 113] + Expressions.and( + Expressions.greaterThanOrEqual("_long", 57), Expressions.lessThan("_long", 114)), + + // page-3, page-4 in row group 0 -> row ranges[171, 284] + Expressions.and( + 
Expressions.greaterThanOrEqual("_long", 173), Expressions.lessThan("_long", 260))); + + filter = + Expressions.or( + filter, + // page-10 in row group 0 and page-0, page-1 in row group 1 -> row ranges: [570, 706] + Expressions.and( + Expressions.greaterThanOrEqual("_long", 572), Expressions.lessThan("_long", 663))); + + List expected = + selectRecords(allRecords, Pair.of(57, 114), Pair.of(171, 285), Pair.of(570, 707)); readAndValidate(filter, expected); } @Test public void testNoRowsMatch() { - Expression filter = Expressions.and( + Expression filter = Expressions.and( - Expressions.greaterThan("_long", 40), - Expressions.lessThan("_long", 46)), - Expressions.equal("_int", ABOVE_INT_COL_MAX_VALUE)); + Expressions.and( + Expressions.greaterThan("_long", 40), Expressions.lessThan("_long", 46)), + Expressions.equal("_int", ABOVE_INT_COL_MAX_VALUE)); readAndValidate(filter, ImmutableList.of()); } @@ -289,29 +288,33 @@ private Schema readSchema() { private void readAndValidate(Expression filter, List expected) { Schema projected = readSchema(); - Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)) - .project(projected) - .filter(filter) - .useColumnIndexFilter(true); + Parquet.ReadBuilder builder = + Parquet.read(Files.localInput(testFile)) + .project(projected) + .filter(filter) + .useColumnIndexFilter(true); Types.StructType struct = projected.asStruct(); if (vectorized) { - CloseableIterable batches = builder.createBatchedReaderFunc( - type -> - VectorizedSparkParquetReaders.buildReader(projected, type, true)) - .build(); + CloseableIterable batches = + builder + .createBatchedReaderFunc( + type -> VectorizedSparkParquetReaders.buildReader(projected, type, true)) + .build(); Iterator expectedIterator = expected.iterator(); for (ColumnarBatch batch : batches) { TestHelpers.assertEqualsBatch(struct, expectedIterator, batch, true); } - Assert.assertFalse("The expected records is more than the actual result", expectedIterator.hasNext()); + 
Assert.assertFalse( + "The expected records is more than the actual result", expectedIterator.hasNext()); } else { - CloseableIterable reader = builder.createReaderFunc( - type -> SparkParquetReaders.buildReader(projected, type)) - .build(); + CloseableIterable reader = + builder + .createReaderFunc(type -> SparkParquetReaders.buildReader(projected, type)) + .build(); CloseableIterator actualRows = reader.iterator(); for (GenericData.Record record : expected) { @@ -323,7 +326,8 @@ private void readAndValidate(Expression filter, List expecte } } - private List selectRecords(List records, Pair... ranges) { + private List selectRecords( + List records, Pair... ranges) { return Arrays.stream(ranges) .map(range -> records.subList(range.first(), range.second())) .flatMap(Collection::stream) From 11443cef2bf37de9edc962b9ddf715c699fa652e Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Wed, 1 Mar 2023 13:59:38 +0800 Subject: [PATCH 09/21] Reset page skipping state for each row group and add a test. 
--- .../iceberg/parquet/BaseColumnIterator.java | 2 ++ .../data/TestSparkParquetPageSkipping.java | 25 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java index 88531519e977..089cab55f5ba 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java @@ -59,6 +59,8 @@ public void setPageSource(PageReader source, Optional rowRanges) { this.synchronizing = true; this.rowIndexes = rowRanges.get().iterator(); this.targetRowIndex = Long.MIN_VALUE; + } else { + this.synchronizing = false; } BasePageIterator pageIterator = pageIterator(); diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java index cfdc33c9552c..a9d52bad82fc 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java +++ b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java @@ -101,7 +101,9 @@ public class TestSparkParquetPageSkipping { private File testFile; private List allRecords = Lists.newArrayList(); - /* Column and offset indexes info of `_long` column in `testFile` printed by parquet-cli's column-index command: + /* Column and offset indexes info of `_long` column in `testFile` copied from text printed by parquet-cli's + column-index command: + row-group 0: column index for column _long: Boudary order: ASCENDING @@ -264,6 +266,27 @@ public void testMultipleRowGroupsMatch() { readAndValidate(filter, expected); } + @Test + public void testOnlyFilterPagesOnOneRowGroup() { + Expression filter = + Expressions.and( + Expressions.greaterThanOrEqual("_long", 57), + Expressions.lessThan("_long", 
114)); // exactly page-1 -> row ranges: [57, 113] + + filter = + Expressions.or( + filter, + // page-9, page-10 in row group 0 -> row ranges: [513, 592] + // and all pages in row group 1 + Expressions.greaterThanOrEqual("_long", 569)); + + // some pages of row group 0 and all pages of row group 1 + List expected = + selectRecords(allRecords, Pair.of(57, 114), Pair.of(513, 593), Pair.of(593, 1000)); + + readAndValidate(filter, expected); + } + @Test public void testNoRowsMatch() { Expression filter = From ad053284fb9f0f70323d7874fab57900ada2e3b1 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Wed, 1 Mar 2023 14:23:20 +0800 Subject: [PATCH 10/21] Fix. --- .../java/org/apache/iceberg/parquet/BaseColumnIterator.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java index 089cab55f5ba..af1101699b4a 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java @@ -68,7 +68,9 @@ public void setPageSource(PageReader source, Optional rowRanges) { dictionary = ParquetUtil.readDictionary(desc, pageSource); pageIterator.setDictionary(dictionary); advance(); - skip(); + if (synchronizing) { + skip(); + } } protected abstract BasePageIterator pageIterator(); From 892dba9c9dfe1e35e1dcd54dac709291cb2963fa Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Wed, 1 Mar 2023 14:28:17 +0800 Subject: [PATCH 11/21] Refactor tests. 
--- .../org/apache/iceberg/parquet/TestColumnIndexFilter.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java index 258fde17c07e..e5012f2bc4c3 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -888,7 +888,7 @@ public void testBinaryBackedDecimal() { .named(binaryDecimal), ASCENDING) .addPage(0, decimalToBytes("12.34"), decimalToBytes("12.35")) - .addPage(0, decimalToBytes("123456789.87654321"), decimalToBytes("123456789.87654323")) + .addPage(0, decimalToBytes("1234567890.987654321"), decimalToBytes("1234567890.987654323")) .build(); OffsetIndex binaryDecimalOI = new OIBuilder().addPage(5).addPage(4).build(); @@ -932,14 +932,14 @@ public OffsetIndex getOffsetIndex(ColumnPath columnPath) { Expression expr = or( lessThan(binaryDecimal, new BigDecimal("12.34")), - greaterThanOrEqual(binaryDecimal, new BigDecimal("123456789.87654322"))); + greaterThanOrEqual(binaryDecimal, new BigDecimal("1234567890.987654322"))); RowRanges expected = selectRowRanges(binaryDecimal, columnIndexStore, rowCount, 1); RowRanges actual = calculateRowRanges(schema, messageType, expr, true, columnIndexStore, rowCount); assertRowRangesEquals(expected, actual); - expr = greaterThan(binaryDecimal, new BigDecimal("123456789.87654323")); + expr = greaterThan(binaryDecimal, new BigDecimal("1234567890.987654323")); expected = NO_ROWS; actual = calculateRowRanges(schema, messageType, expr, true, columnIndexStore, rowCount); assertRowRangesEquals(expected, actual); From 1639d3caf86e95fb29972c2d80a3fb1936d8be1b Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Wed, 1 Mar 2023 15:42:27 +0800 Subject: [PATCH 12/21] Spotless. 
--- .../java/org/apache/iceberg/parquet/TestColumnIndexFilter.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java index e5012f2bc4c3..1d5d547f96ee 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -888,7 +888,8 @@ public void testBinaryBackedDecimal() { .named(binaryDecimal), ASCENDING) .addPage(0, decimalToBytes("12.34"), decimalToBytes("12.35")) - .addPage(0, decimalToBytes("1234567890.987654321"), decimalToBytes("1234567890.987654323")) + .addPage( + 0, decimalToBytes("1234567890.987654321"), decimalToBytes("1234567890.987654323")) .build(); OffsetIndex binaryDecimalOI = new OIBuilder().addPage(5).addPage(4).build(); From b90f1e7996e6a800c83ba2743ca68f549a3ba35e Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Wed, 1 Mar 2023 18:50:05 +0800 Subject: [PATCH 13/21] Fix comment. --- .../org/apache/iceberg/parquet/ParquetColumnIndexFilter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java index be44ef68c042..461ef9e14c87 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java @@ -530,8 +530,8 @@ private RowRanges applyPredicate( } // If the column index of a column is not available, we cannot filter on this column. - // If the offset index of a column is not available, we cannot filter on this row group. - // Get the offset index first so that the MissingOffsetIndexException (if any) is thrown ASAP. 
+ // If the offset index of a column is not available, a MissingOffsetIndexException will + // be thrown out, and we cannot filter on this row group. OffsetIndex offsetIndex = offsetIndex(columnId); ParquetColumnIndex columnIndex = columnIndex(columnId); if (columnIndex == null) { From 2f4aa339487595b05dbc7e5427d0807207b74a6c Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Mon, 6 Mar 2023 19:34:45 +0800 Subject: [PATCH 14/21] Refactor synchronization. --- .../iceberg/parquet/BaseColumnIterator.java | 22 ++------ .../iceberg/parquet/ColumnIterator.java | 50 ++++++++++--------- .../iceberg/parquet/ParquetValueReaders.java | 12 +++++ .../iceberg/parquet/TripleIterator.java | 13 +++++ .../spark/data/SparkParquetReaders.java | 4 +- 5 files changed, 57 insertions(+), 44 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java index af1101699b4a..f7b8b285578c 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java @@ -19,7 +19,6 @@ package org.apache.iceberg.parquet; import java.util.Optional; -import java.util.PrimitiveIterator; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Dictionary; import org.apache.parquet.column.page.DataPage; @@ -39,9 +38,7 @@ public abstract class BaseColumnIterator { protected Dictionary dictionary; // state for page skipping - protected boolean synchronizing = false; - protected PrimitiveIterator.OfLong rowIndexes; - protected long targetRowIndex; + protected boolean needsSynchronize; protected long currentRowIndex; protected int skipValues; @@ -55,30 +52,17 @@ public void setPageSource(PageReader source, Optional rowRanges) { this.triplesCount = source.getTotalValueCount(); this.triplesRead = 0L; this.advanceNextPageCount = 0L; - if 
(rowRanges.isPresent()) { - this.synchronizing = true; - this.rowIndexes = rowRanges.get().iterator(); - this.targetRowIndex = Long.MIN_VALUE; - } else { - this.synchronizing = false; - } + this.needsSynchronize = rowRanges.isPresent(); BasePageIterator pageIterator = pageIterator(); pageIterator.reset(); dictionary = ParquetUtil.readDictionary(desc, pageSource); pageIterator.setDictionary(dictionary); advance(); - if (synchronizing) { - skip(); - } } protected abstract BasePageIterator pageIterator(); - protected void skip() { - throw new UnsupportedOperationException(); - } - protected void advance() { if (triplesRead >= advanceNextPageCount) { BasePageIterator pageIterator = pageIterator(); @@ -88,7 +72,7 @@ protected void advance() { pageIterator.setPage(page); this.advanceNextPageCount += pageIterator.currentPageCount(); - if (synchronizing) { + if (needsSynchronize) { long firstRowIndex = page.getFirstRowIndex() .orElseThrow( diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java index f6fa64ec0d3c..3cb2b7799626 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java @@ -18,7 +18,11 @@ */ package org.apache.iceberg.parquet; +import java.util.Optional; +import java.util.PrimitiveIterator; import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.page.PageReader; +import org.apache.parquet.internal.filter2.columnindex.RowRanges; import org.apache.parquet.io.api.Binary; public abstract class ColumnIterator extends BaseColumnIterator implements TripleIterator { @@ -89,6 +93,8 @@ public Binary next() { } private final PageIterator pageIterator; + private PrimitiveIterator.OfLong rowIndexes; + private long targetRowIndex = Long.MIN_VALUE; private ColumnIterator(ColumnDescriptor desc, String writerVersion) { super(desc); @@ -111,63 +117,49 @@ 
public int currentRepetitionLevel() { public boolean nextBoolean() { this.triplesRead += 1; advance(); - boolean value = pageIterator.nextBoolean(); - skip(); - return value; + return pageIterator.nextBoolean(); } @Override public int nextInteger() { this.triplesRead += 1; advance(); - int value = pageIterator.nextInteger(); - skip(); - return value; + return pageIterator.nextInteger(); } @Override public long nextLong() { this.triplesRead += 1; advance(); - long value = pageIterator.nextLong(); - skip(); - return value; + return pageIterator.nextLong(); } @Override public float nextFloat() { this.triplesRead += 1; advance(); - float value = pageIterator.nextFloat(); - skip(); - return value; + return pageIterator.nextFloat(); } @Override public double nextDouble() { this.triplesRead += 1; advance(); - double value = pageIterator.nextDouble(); - skip(); - return value; + return pageIterator.nextDouble(); } @Override public Binary nextBinary() { this.triplesRead += 1; advance(); - Binary value = pageIterator.nextBinary(); - skip(); - return value; + return pageIterator.nextBinary(); } @Override public N nextNull() { this.triplesRead += 1; advance(); - N value = pageIterator.nextNull(); - skip(); - return value; + return pageIterator.nextNull(); } @Override @@ -176,11 +168,21 @@ protected BasePageIterator pageIterator() { } @Override - protected void skip() { - if (!synchronizing) { - return; + public void setPageSource(PageReader source, Optional rowRanges) { + super.setPageSource(source, rowRanges); + if (rowRanges.isPresent()) { + this.rowIndexes = rowRanges.get().iterator(); + this.targetRowIndex = Long.MIN_VALUE; } + } + @Override + public boolean needsSynchronize() { + return needsSynchronize; + } + + @Override + public void synchronize() { skipValues = 0; while (hasNext()) { advance(); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java index 
7bb5d8537436..e338cd42b443 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java @@ -731,6 +731,8 @@ private interface Setter { private final TripleIterator column; private final List> children; + private boolean topLevel = false; + @SuppressWarnings("unchecked") protected StructReader(List types, List> readers) { this.readers = @@ -752,6 +754,10 @@ protected StructReader(List types, List> readers) { this.column = firstNonNullColumn(children); } + public final void topLevel() { + this.topLevel = true; + } + @Override public final void setPageSource( PageReadStore pageStore, long rowPosition, Optional rowRanges) { @@ -767,6 +773,12 @@ public final TripleIterator column() { @Override public final T read(T reuse) { + if (topLevel && column.needsSynchronize()) { + for (TripleIterator child : children) { + child.synchronize(); + } + } + I intermediate = newStructData(reuse); for (int i = 0; i < readers.length; i += 1) { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java index 5a833d4c4447..0cb5e18a6631 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java @@ -129,4 +129,17 @@ default Binary nextBinary() { * @throws java.util.NoSuchElementException if there are no more elements */ N nextNull(); + + /** + * Returns true when some triples in this iterator might need to be skipped. + * @return + */ + default boolean needsSynchronize() { + return false; + } + + /** + * Skips triples to synchronize the row reading. 
+ */ + default void synchronize() {} } diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index af16d9bbc290..b54dd1edcca4 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -132,7 +132,9 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message( Types.StructType expected, MessageType message, List> fieldReaders) { - return struct(expected, message.asGroupType(), fieldReaders); + StructReader struct = (StructReader) struct(expected, message.asGroupType(), fieldReaders); + struct.topLevel(); + return struct; } @Override From dcdfff050c60baa2483d77004cd8e17cc680f83a Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Mon, 6 Mar 2023 19:39:04 +0800 Subject: [PATCH 15/21] Spotless. --- .../java/org/apache/iceberg/parquet/TripleIterator.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java index 0cb5e18a6631..7eb91d476fd1 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/TripleIterator.java @@ -132,14 +132,13 @@ default Binary nextBinary() { /** * Returns true when some triples in this iterator might need to be skipped. - * @return + * + * @return whether this iterator needs to be synchronized */ default boolean needsSynchronize() { return false; } - /** - * Skips triples to synchronize the row reading. - */ + /** Skips triples to synchronize the row reading. 
*/ default void synchronize() {} } From 147d460fa5953abf9b311c3f8164d67f85e54524 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Mon, 6 Mar 2023 21:48:47 +0800 Subject: [PATCH 16/21] Fix style. --- .../parquet/ParquetColumnIndexFilter.java | 50 +++++++++---------- .../iceberg/parquet/ParquetValueReader.java | 2 +- .../org/apache/iceberg/parquet/ReadConf.java | 4 +- 3 files changed, 26 insertions(+), 30 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java index 461ef9e14c87..7bc01f91f418 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java @@ -18,12 +18,6 @@ */ package org.apache.iceberg.parquet; -import static org.apache.iceberg.parquet.PageSkippingHelpers.allPageIndexes; -import static org.apache.iceberg.parquet.PageSkippingHelpers.allRows; -import static org.apache.iceberg.parquet.PageSkippingHelpers.filterPageIndexes; -import static org.apache.iceberg.parquet.PageSkippingHelpers.intersection; -import static org.apache.iceberg.parquet.PageSkippingHelpers.union; - import java.math.BigDecimal; import java.math.BigInteger; import java.nio.ByteBuffer; @@ -88,7 +82,7 @@ public RowRanges calculateRowRanges( return new ColumnIndexEvalVisitor(fileSchema, columnIndexStore, rowCount).eval(); } catch (ColumnIndexStore.MissingOffsetIndexException e) { LOG.info("Cannot get required offset index; Unable to filter on this row group", e); - return allRows(rowCount); + return PageSkippingHelpers.allRows(rowCount); } } @@ -111,7 +105,7 @@ private class ColumnIndexEvalVisitor private ColumnIndexEvalVisitor( MessageType fileSchema, ColumnIndexStore columnIndexStore, long rowCount) { - this.allRows = allRows(rowCount); + this.allRows = PageSkippingHelpers.allRows(rowCount); 
this.columnIndexStore = columnIndexStore; this.rowCount = rowCount; @@ -154,12 +148,12 @@ public RowRanges not(RowRanges result) { @Override public RowRanges and(RowRanges left, RowRanges right) { - return intersection(left, right); + return PageSkippingHelpers.intersection(left, right); } @Override public RowRanges or(RowRanges left, RowRanges right) { - return union(left, right); + return PageSkippingHelpers.union(left, right); } @Override @@ -169,11 +163,12 @@ public RowRanges isNull(BoundReference ref) { Function func = columnIndex -> { if (columnIndex.hasNullCounts()) { - return filterPageIndexes(columnIndex.pageCount(), columnIndex::containsNull); + return PageSkippingHelpers.filterPageIndexes( + columnIndex.pageCount(), columnIndex::containsNull); } else { // Searching for nulls so if we don't have null related statistics we have to return // all pages - return allPageIndexes(columnIndex.pageCount()); + return PageSkippingHelpers.allPageIndexes(columnIndex.pageCount()); } }; @@ -192,7 +187,9 @@ public RowRanges notNull(BoundReference ref) { } Function func = - columnIndex -> filterPageIndexes(columnIndex.pageCount(), columnIndex::isNonNullPage); + columnIndex -> + PageSkippingHelpers.filterPageIndexes( + columnIndex.pageCount(), columnIndex::isNonNullPage); return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -202,7 +199,9 @@ public RowRanges isNaN(BoundReference ref) { int id = ref.fieldId(); Function func = - columnIndex -> filterPageIndexes(columnIndex.pageCount(), columnIndex::isNonNullPage); + columnIndex -> + PageSkippingHelpers.filterPageIndexes( + columnIndex.pageCount(), columnIndex::isNonNullPage); return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -234,7 +233,7 @@ public RowRanges lt(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return filterPageIndexes(columnIndex.pageCount(), filter); + return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, 
ROWS_CANNOT_MATCH); @@ -260,7 +259,7 @@ public RowRanges ltEq(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return filterPageIndexes(columnIndex.pageCount(), filter); + return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -285,7 +284,7 @@ public RowRanges gt(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return filterPageIndexes(columnIndex.pageCount(), filter); + return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -310,7 +309,7 @@ public RowRanges gtEq(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return filterPageIndexes(columnIndex.pageCount(), filter); + return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -341,7 +340,7 @@ public RowRanges eq(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return filterPageIndexes(columnIndex.pageCount(), filter); + return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -378,7 +377,7 @@ public RowRanges in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; }; - return filterPageIndexes(columnIndex.pageCount(), filter); + return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -450,7 +449,7 @@ public RowRanges startsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return filterPageIndexes(columnIndex.pageCount(), filter); + return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -514,7 +513,7 @@ public RowRanges notStartsWith(BoundReference ref, Literal lit) { filter = pageIndex -> ROWS_MIGHT_MATCH; } - return 
filterPageIndexes(columnIndex.pageCount(), filter); + return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_MIGHT_MATCH); @@ -614,11 +613,11 @@ private List nullPages() { } private T min(int pageIndex) { - return fromBytes(minBuffer(pageIndex), primitiveType, icebergType); + return fromBytes(minBuffer(pageIndex)); } private T max(int pageIndex) { - return fromBytes(maxBuffer(pageIndex), primitiveType, icebergType); + return fromBytes(maxBuffer(pageIndex)); } private Boolean isNullPage(int pageIndex) { @@ -650,8 +649,7 @@ private int pageCount() { } @SuppressWarnings("unchecked") - private T fromBytes( - ByteBuffer bytes, PrimitiveType primitiveType, Type.PrimitiveType icebergType) { + private T fromBytes(ByteBuffer bytes) { LogicalTypeAnnotation logicalTypeAnnotation = primitiveType.getLogicalTypeAnnotation(); Optional converted = logicalTypeAnnotation == null diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java index adc802ba1c01..bfa1219c3993 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java @@ -30,5 +30,5 @@ public interface ParquetValueReader { List> columns(); - void setPageSource(PageReadStore pageStore, long rowPosition, Optional RowRanges); + void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java index aaaba05b28df..986c1100b98e 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java @@ -18,8 +18,6 @@ */ package org.apache.iceberg.parquet; -import static 
org.apache.iceberg.parquet.PageSkippingHelpers.getColumnIndexStore; - import java.io.IOException; import java.io.UncheckedIOException; import java.util.List; @@ -134,7 +132,7 @@ class ReadConf { typeWithIds, rowGroup, reader.getBloomFilterDataReader(rowGroup))); if (useColumnIndexFilter && filter != null && shouldRead) { - ColumnIndexStore columnIndexStore = getColumnIndexStore(reader, i); + ColumnIndexStore columnIndexStore = PageSkippingHelpers.getColumnIndexStore(reader, i); RowRanges rowRanges = columnIndexFilter.calculateRowRanges( typeWithIds, columnIndexStore, rowGroup.getRowCount()); From 9948edea5892ce96a0ba8f781d13821abedfe7c3 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Sun, 17 Sep 2023 19:59:02 +0800 Subject: [PATCH 17/21] Use API from parquet-mr 1.13 and remove PageSkippingHelpers. --- .../iceberg/parquet/PageSkippingHelpers.java | 128 ------------------ .../parquet/ParquetColumnIndexFilter.java | 44 +++--- .../apache/iceberg/parquet/ParquetReader.java | 4 +- .../org/apache/iceberg/parquet/ReadConf.java | 2 +- .../parquet/TestColumnIndexFilter.java | 13 +- 5 files changed, 29 insertions(+), 162 deletions(-) delete mode 100644 parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java b/parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java deleted file mode 100644 index c56bdf0b2d7d..000000000000 --- a/parquet/src/main/java/org/apache/iceberg/parquet/PageSkippingHelpers.java +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.parquet; - -import java.util.List; -import java.util.PrimitiveIterator; -import java.util.function.IntPredicate; -import org.apache.iceberg.common.DynConstructors; -import org.apache.iceberg.common.DynMethods; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.parquet.column.page.PageReadStore; -import org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.internal.column.columnindex.OffsetIndex; -import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; -import org.apache.parquet.internal.filter2.columnindex.RowRanges; - -/** Helper methods for page skipping. 
*/ -class PageSkippingHelpers { - private PageSkippingHelpers() {} - - private static final DynConstructors.Ctor RANGES_LIST_CTOR = - DynConstructors.builder().hiddenImpl(RowRanges.class, List.class).build(); - - private static final RowRanges EMPTY = RANGES_LIST_CTOR.newInstance(ImmutableList.of()); - - static RowRanges empty() { - return EMPTY; - } - - private static final DynMethods.StaticMethod UNION = - DynMethods.builder("union") - .hiddenImpl(RowRanges.class, RowRanges.class, RowRanges.class) - .buildStatic(); - - static RowRanges union(RowRanges left, RowRanges right) { - return UNION.invoke(left, right); - } - - private static final DynMethods.StaticMethod INTERSECTION = - DynMethods.builder("intersection") - .hiddenImpl(RowRanges.class, RowRanges.class, RowRanges.class) - .buildStatic(); - - static RowRanges intersection(RowRanges left, RowRanges right) { - return INTERSECTION.invoke(left, right); - } - - private static final DynMethods.StaticMethod ROW_RANGES_CREATE = - DynMethods.builder("create") - .hiddenImpl(RowRanges.class, long.class, PrimitiveIterator.OfInt.class, OffsetIndex.class) - .buildStatic(); - - static RowRanges createRowRanges( - long rowCount, PrimitiveIterator.OfInt pageIndexes, OffsetIndex offsetIndex) { - return ROW_RANGES_CREATE.invoke(rowCount, pageIndexes, offsetIndex); - } - - private static final DynMethods.StaticMethod ROW_RANGES_CREATE_SINGLE = - DynMethods.builder("createSingle").hiddenImpl(RowRanges.class, long.class).buildStatic(); - - static RowRanges allRows(long rowCount) { - return ROW_RANGES_CREATE_SINGLE.invoke(rowCount); - } - - private static final DynMethods.StaticMethod INDEX_ITERATOR_ALL = - DynMethods.builder("all") - .hiddenImpl("org.apache.parquet.internal.column.columnindex.IndexIterator", int.class) - .buildStatic(); - - static PrimitiveIterator.OfInt allPageIndexes(int pageCount) { - return INDEX_ITERATOR_ALL.invoke(pageCount); - } - - private static final DynMethods.StaticMethod INDEX_ITERATOR_FILTER = - 
DynMethods.builder("filter") - .hiddenImpl( - "org.apache.parquet.internal.column.columnindex.IndexIterator", - int.class, - IntPredicate.class) - .buildStatic(); - - static PrimitiveIterator.OfInt filterPageIndexes(int pageCount, IntPredicate filter) { - return INDEX_ITERATOR_FILTER.invoke(pageCount, filter); - } - - private static final DynMethods.UnboundMethod GET_COLUMN_INDEX_STORE = - DynMethods.builder("getColumnIndexStore") - .hiddenImpl("org.apache.parquet.hadoop.ParquetFileReader", int.class) - .build(); - - static ColumnIndexStore getColumnIndexStore(ParquetFileReader reader, int blockIndex) { - return GET_COLUMN_INDEX_STORE.invoke(reader, blockIndex); - } - - private static final DynMethods.UnboundMethod INTERNAL_READ_FILTERED_ROW_GROUP = - DynMethods.builder("internalReadFilteredRowGroup") - .hiddenImpl( - "org.apache.parquet.hadoop.ParquetFileReader", - BlockMetaData.class, - RowRanges.class, - ColumnIndexStore.class) - .build(); - - static PageReadStore internalReadFilteredRowGroup( - ParquetFileReader reader, int blockIndex, RowRanges rowRanges) { - ColumnIndexStore columnIndexStore = GET_COLUMN_INDEX_STORE.invoke(reader, blockIndex); - BlockMetaData blockMetaData = reader.getRowGroups().get(blockIndex); - return INTERNAL_READ_FILTERED_ROW_GROUP.invoke( - reader, blockMetaData, rowRanges, columnIndexStore); - } -} diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java index 7bc01f91f418..d3e3b3a937d4 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetColumnIndexFilter.java @@ -46,6 +46,7 @@ import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.internal.column.columnindex.ColumnIndex; +import org.apache.parquet.internal.column.columnindex.IndexIterator; 
import org.apache.parquet.internal.column.columnindex.OffsetIndex; import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore; import org.apache.parquet.internal.filter2.columnindex.RowRanges; @@ -82,13 +83,13 @@ public RowRanges calculateRowRanges( return new ColumnIndexEvalVisitor(fileSchema, columnIndexStore, rowCount).eval(); } catch (ColumnIndexStore.MissingOffsetIndexException e) { LOG.info("Cannot get required offset index; Unable to filter on this row group", e); - return PageSkippingHelpers.allRows(rowCount); + return RowRanges.createSingle(rowCount); } } private static final boolean ROWS_MIGHT_MATCH = true; private static final boolean ROWS_CANNOT_MATCH = false; - private static final RowRanges NO_ROWS = PageSkippingHelpers.empty(); + private static final RowRanges NO_ROWS = RowRanges.EMPTY; private class ColumnIndexEvalVisitor extends ExpressionVisitors.BoundExpressionVisitor { @@ -105,7 +106,7 @@ private class ColumnIndexEvalVisitor private ColumnIndexEvalVisitor( MessageType fileSchema, ColumnIndexStore columnIndexStore, long rowCount) { - this.allRows = PageSkippingHelpers.allRows(rowCount); + this.allRows = RowRanges.createSingle(rowCount); this.columnIndexStore = columnIndexStore; this.rowCount = rowCount; @@ -148,12 +149,12 @@ public RowRanges not(RowRanges result) { @Override public RowRanges and(RowRanges left, RowRanges right) { - return PageSkippingHelpers.intersection(left, right); + return RowRanges.intersection(left, right); } @Override public RowRanges or(RowRanges left, RowRanges right) { - return PageSkippingHelpers.union(left, right); + return RowRanges.union(left, right); } @Override @@ -163,12 +164,11 @@ public RowRanges isNull(BoundReference ref) { Function func = columnIndex -> { if (columnIndex.hasNullCounts()) { - return PageSkippingHelpers.filterPageIndexes( - columnIndex.pageCount(), columnIndex::containsNull); + return IndexIterator.filter(columnIndex.pageCount(), columnIndex::containsNull); } else { // Searching 
for nulls so if we don't have null related statistics we have to return // all pages - return PageSkippingHelpers.allPageIndexes(columnIndex.pageCount()); + return IndexIterator.all(columnIndex.pageCount()); } }; @@ -187,9 +187,7 @@ public RowRanges notNull(BoundReference ref) { } Function func = - columnIndex -> - PageSkippingHelpers.filterPageIndexes( - columnIndex.pageCount(), columnIndex::isNonNullPage); + columnIndex -> IndexIterator.filter(columnIndex.pageCount(), columnIndex::isNonNullPage); return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -199,9 +197,7 @@ public RowRanges isNaN(BoundReference ref) { int id = ref.fieldId(); Function func = - columnIndex -> - PageSkippingHelpers.filterPageIndexes( - columnIndex.pageCount(), columnIndex::isNonNullPage); + columnIndex -> IndexIterator.filter(columnIndex.pageCount(), columnIndex::isNonNullPage); return applyPredicate(id, func, ROWS_CANNOT_MATCH); } @@ -233,7 +229,7 @@ public RowRanges lt(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); + return IndexIterator.filter(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -259,7 +255,7 @@ public RowRanges ltEq(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); + return IndexIterator.filter(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -284,7 +280,8 @@ public RowRanges gt(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); + + return IndexIterator.filter(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -309,7 +306,8 @@ public RowRanges gtEq(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return 
PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); + + return IndexIterator.filter(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -340,7 +338,7 @@ public RowRanges eq(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); + return IndexIterator.filter(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -377,7 +375,7 @@ public RowRanges in(BoundReference ref, Set literalSet) { return ROWS_MIGHT_MATCH; }; - return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); + return IndexIterator.filter(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -449,7 +447,7 @@ public RowRanges startsWith(BoundReference ref, Literal lit) { return ROWS_MIGHT_MATCH; }; - return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); + return IndexIterator.filter(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_CANNOT_MATCH); @@ -513,7 +511,7 @@ public RowRanges notStartsWith(BoundReference ref, Literal lit) { filter = pageIndex -> ROWS_MIGHT_MATCH; } - return PageSkippingHelpers.filterPageIndexes(columnIndex.pageCount(), filter); + return IndexIterator.filter(columnIndex.pageCount(), filter); }; return applyPredicate(id, func, ROWS_MIGHT_MATCH); @@ -540,7 +538,7 @@ private RowRanges applyPredicate( return allRows; } - return PageSkippingHelpers.createRowRanges(rowCount, func.apply(columnIndex), offsetIndex); + return RowRanges.create(rowCount, func.apply(columnIndex), offsetIndex); } // Assumes that the column corresponding to the id exists in the file. 
diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java index bb9fd6fb423b..8ccfbbf7d212 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java @@ -153,9 +153,7 @@ private void advance() { Optional rowRanges = Optional.ofNullable(rowRangesArr[nextRowGroup]); try { if (rowRanges.isPresent()) { - pages = - PageSkippingHelpers.internalReadFilteredRowGroup( - reader, nextRowGroup, rowRanges.get()); + pages = reader.readFilteredRowGroup(nextRowGroup, rowRanges.get()); } else { pages = reader.readRowGroup(nextRowGroup); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java index 986c1100b98e..ff9d275c8193 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ReadConf.java @@ -132,7 +132,7 @@ class ReadConf { typeWithIds, rowGroup, reader.getBloomFilterDataReader(rowGroup))); if (useColumnIndexFilter && filter != null && shouldRead) { - ColumnIndexStore columnIndexStore = PageSkippingHelpers.getColumnIndexStore(reader, i); + ColumnIndexStore columnIndexStore = reader.getColumnIndexStore(i); RowRanges rowRanges = columnIndexFilter.calculateRowRanges( typeWithIds, columnIndexStore, rowGroup.getRowCount()); diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java index 1d5d547f96ee..6d3c376d6668 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -409,9 +409,9 @@ public OffsetIndex getOffsetIndex(ColumnPath column) { * 29. 
26 Alfa null null null * */ - private static final RowRanges ALL_ROWS = PageSkippingHelpers.allRows(TOTAL_ROW_COUNT); + private static final RowRanges ALL_ROWS = RowRanges.createSingle(TOTAL_ROW_COUNT); - private static final RowRanges NO_ROWS = PageSkippingHelpers.empty(); + private static final RowRanges NO_ROWS = RowRanges.EMPTY; private static RowRanges selectRowRanges(String path, int... pageIndexes) { return selectRowRanges(path, STORE, TOTAL_ROW_COUNT, pageIndexes); @@ -419,7 +419,7 @@ private static RowRanges selectRowRanges(String path, int... pageIndexes) { private static RowRanges selectRowRanges( String path, ColumnIndexStore store, long rowCount, int... pageIndexes) { - return PageSkippingHelpers.createRowRanges( + return RowRanges.create( rowCount, new PrimitiveIterator.OfInt() { int index = -1; @@ -603,8 +603,7 @@ public void testAnd() { assertRowRangesEquals(expected, calculateRowRanges(expr)); expr = and(equal(INT_COL, 2), equal(STR_COL, "Tango")); - expected = - PageSkippingHelpers.intersection(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); + expected = RowRanges.intersection(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); } @@ -617,12 +616,12 @@ public void testOr() { expr = or(equal(INT_COL, 1), equal(INT_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); - expected = PageSkippingHelpers.union(selectRowRanges(INT_COL, 0), selectRowRanges(STR_COL, 7)); + expected = RowRanges.union(selectRowRanges(INT_COL, 0), selectRowRanges(STR_COL, 7)); expr = or(equal(INT_COL, 1), equal(STR_COL, "Alfa")); assertRowRangesEquals(expected, calculateRowRanges(expr)); expr = or(equal(INT_COL, 2), equal(STR_COL, "Tango")); - expected = PageSkippingHelpers.union(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); + expected = RowRanges.union(selectRowRanges(INT_COL, 1), selectRowRanges(STR_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); } 
From a7e8fc591ac2c4a0284fe349993e73a62bd09762 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Sun, 17 Sep 2023 20:05:51 +0800 Subject: [PATCH 18/21] Fix revapi broken. --- .../iceberg/parquet/BaseColumnIterator.java | 5 +++ .../iceberg/parquet/ColumnIterator.java | 5 +++ .../apache/iceberg/parquet/ParquetReader.java | 21 +++++++++++ .../iceberg/parquet/ParquetValueReader.java | 8 ++++- .../iceberg/parquet/ParquetValueReaders.java | 36 +++++++++++++++++++ 5 files changed, 74 insertions(+), 1 deletion(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java index f7b8b285578c..199de07160d8 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/BaseColumnIterator.java @@ -47,6 +47,11 @@ protected BaseColumnIterator(ColumnDescriptor descriptor) { this.definitionLevel = desc.getMaxDefinitionLevel() - 1; } + @Deprecated + public void setPageSource(PageReader source) { + setPageSource(source, Optional.empty()); + } + public void setPageSource(PageReader source, Optional rowRanges) { this.pageSource = source; this.triplesCount = source.getTotalValueCount(); diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java b/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java index 3cb2b7799626..622b8e5921dc 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ColumnIterator.java @@ -167,6 +167,11 @@ protected BasePageIterator pageIterator() { return pageIterator; } + @Override + public void setPageSource(PageReader source) { + setPageSource(source, Optional.empty()); + } + @Override public void setPageSource(PageReader source, Optional rowRanges) { super.setPageSource(source, rowRanges); diff --git 
a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java index 8ccfbbf7d212..23302169036c 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetReader.java @@ -47,6 +47,27 @@ public class ParquetReader extends CloseableGroup implements CloseableIterabl private final NameMapping nameMapping; private final boolean useColumnIndexFilter; + public ParquetReader( + InputFile input, + Schema expectedSchema, + ParquetReadOptions options, + Function> readerFunc, + NameMapping nameMapping, + Expression filter, + boolean reuseContainers, + boolean caseSensitive) { + this( + input, + expectedSchema, + options, + readerFunc, + nameMapping, + filter, + reuseContainers, + caseSensitive, + false); + } + public ParquetReader( InputFile input, Schema expectedSchema, diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java index bfa1219c3993..bf85ac9b94dd 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReader.java @@ -30,5 +30,11 @@ public interface ParquetValueReader { List> columns(); - void setPageSource(PageReadStore pageStore, long rowPosition, Optional rowRanges); + void setPageSource(PageReadStore pageStore, long rowPosition); + + default void setPageSource( + PageReadStore pageStore, long rowPosition, Optional rowRanges) { + throw new UnsupportedOperationException( + this.getClass().getName() + " does not implement setPageSource"); + } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java index e338cd42b443..c15794288ef4 100644 --- 
a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetValueReaders.java @@ -114,6 +114,9 @@ public List> columns() { return COLUMNS; } + @Override + public void setPageSource(PageReadStore pageStore, long rowPosition) {} + @Override public void setPageSource( PageReadStore pageStore, long rowPosition, Optional rowRanges) {} @@ -178,6 +181,9 @@ public List> columns() { return children; } + @Override + public void setPageSource(PageReadStore pageStore, long rowPosition) {} + @Override public void setPageSource( PageReadStore pageStore, long rowPosition, Optional rowRanges) {} @@ -203,6 +209,11 @@ public List> columns() { return NullReader.COLUMNS; } + @Override + public void setPageSource(PageReadStore pageStore, long rowPosition) { + this.setPageSource(pageStore, rowPosition, Optional.empty()); + } + @Override public void setPageSource( PageReadStore pageStore, long rowPosition, Optional rowRanges) { @@ -242,6 +253,11 @@ protected PrimitiveReader(ColumnDescriptor desc) { this.children = ImmutableList.of(column); } + @Override + public void setPageSource(PageReadStore pageStore, long rowPosition) { + this.setPageSource(pageStore, rowPosition, Optional.empty()); + } + @Override public void setPageSource( PageReadStore pageStore, long rowPosition, Optional rowRanges) { @@ -427,6 +443,11 @@ private static class OptionReader implements ParquetValueReader { this.children = reader.columns(); } + @Override + public void setPageSource(PageReadStore pageStore, long rowPosition) { + this.setPageSource(pageStore, rowPosition, Optional.empty()); + } + @Override public void setPageSource( PageReadStore pageStore, long rowPosition, Optional rowRanges) { @@ -473,6 +494,11 @@ protected RepeatedReader( this.children = reader.columns(); } + @Override + public void setPageSource(PageReadStore pageStore, long rowPosition) { + this.setPageSource(pageStore, rowPosition, Optional.empty()); + } + @Override 
public void setPageSource( PageReadStore pageStore, long rowPosition, Optional rowRanges) { @@ -593,6 +619,11 @@ protected RepeatedKeyValueReader( .build(); } + @Override + public void setPageSource(PageReadStore pageStore, long rowPosition) { + this.setPageSource(pageStore, rowPosition, Optional.empty()); + } + @Override public void setPageSource( PageReadStore pageStore, long rowPosition, Optional rowRanges) { @@ -758,6 +789,11 @@ public final void topLevel() { this.topLevel = true; } + @Override + public final void setPageSource(PageReadStore pageStore, long rowPosition) { + this.setPageSource(pageStore, rowPosition, Optional.empty()); + } + @Override public final void setPageSource( PageReadStore pageStore, long rowPosition, Optional rowRanges) { From 0dd2696c6d438eec9e1c775c3645296e1b5674ed Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Sun, 17 Sep 2023 20:06:33 +0800 Subject: [PATCH 19/21] Move tests to Spark-3.4. --- .../org/apache/iceberg/spark/data/SparkParquetReaders.java | 4 +--- .../org/apache/iceberg/spark/data/SparkParquetReaders.java | 4 +++- .../iceberg/spark/data/TestSparkParquetPageSkipping.java | 7 +++++-- 3 files changed, 9 insertions(+), 6 deletions(-) rename spark/{v3.3 => v3.4}/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java (98%) diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index b54dd1edcca4..af16d9bbc290 100644 --- a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -132,9 +132,7 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message( Types.StructType expected, MessageType message, List> fieldReaders) { - StructReader struct = (StructReader) struct(expected, 
message.asGroupType(), fieldReaders); - struct.topLevel(); - return struct; + return struct(expected, message.asGroupType(), fieldReaders); } @Override diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java index af16d9bbc290..b54dd1edcca4 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetReaders.java @@ -132,7 +132,9 @@ private static class ReadBuilder extends TypeWithSchemaVisitor message( Types.StructType expected, MessageType message, List> fieldReaders) { - return struct(expected, message.asGroupType(), fieldReaders); + StructReader struct = (StructReader) struct(expected, message.asGroupType(), fieldReaders); + struct.topLevel(); + return struct; } @Override diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java similarity index 98% rename from spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java rename to spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java index a9d52bad82fc..cd8a7d58c432 100644 --- a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetPageSkipping.java @@ -46,6 +46,7 @@ import org.apache.iceberg.parquet.ParquetAvroWriter; import org.apache.iceberg.relocated.com.google.common.base.Function; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import 
org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; @@ -323,12 +324,14 @@ private void readAndValidate(Expression filter, List expecte CloseableIterable batches = builder .createBatchedReaderFunc( - type -> VectorizedSparkParquetReaders.buildReader(projected, type, true)) + type -> + VectorizedSparkParquetReaders.buildReader( + projected, type, ImmutableMap.of(), null)) .build(); Iterator expectedIterator = expected.iterator(); for (ColumnarBatch batch : batches) { - TestHelpers.assertEqualsBatch(struct, expectedIterator, batch, true); + TestHelpers.assertEqualsBatch(struct, expectedIterator, batch); } Assert.assertFalse( From e781008ea1ffa98651064b0f0fe6dfebcf60f3e7 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Sun, 17 Sep 2023 20:21:16 +0800 Subject: [PATCH 20/21] Update tests. --- .../java/org/apache/iceberg/parquet/TestColumnIndexFilter.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java index 6d3c376d6668..288046544450 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -592,9 +592,6 @@ public void testAnd() { RowRanges expected; Expression expr; - List columns = FILE_SCHEMA.getColumns(); - columns.forEach(System.out::println); - expected = NO_ROWS; expr = and(equal(INT_COL, 1), equal(INT_COL, 2)); assertRowRangesEquals(expected, calculateRowRanges(expr)); From 95caa9405a4d38944941d8e72f19e2c26e7a8863 Mon Sep 17 00:00:00 2001 From: ZhongYujiang <42907416+zhongyujiang@users.noreply.github.com> Date: Sun, 17 Sep 2023 20:51:56 +0800 Subject: [PATCH 21/21] Fix style. 
--- .../java/org/apache/iceberg/parquet/TestColumnIndexFilter.java | 1 - 1 file changed, 1 deletion(-) diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java index 288046544450..3b37a3da19dd 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestColumnIndexFilter.java @@ -61,7 +61,6 @@ import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.DecimalUtil; -import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.hadoop.metadata.ColumnPath; import org.apache.parquet.internal.column.columnindex.BoundaryOrder; import org.apache.parquet.internal.column.columnindex.ColumnIndex;