6 changes: 6 additions & 0 deletions core/src/main/java/org/apache/iceberg/InternalData.java
@@ -25,6 +25,7 @@
import org.apache.iceberg.avro.InternalReader;
import org.apache.iceberg.avro.InternalWriter;
import org.apache.iceberg.common.DynMethods;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.InputFile;
@@ -163,6 +164,11 @@ public interface ReadBuilder {
/** Set a custom class for in-memory objects at the given field ID. */
ReadBuilder setCustomType(int fieldId, Class<? extends StructLike> structClass);

/** Set a filter to apply on result rows if applicable. */
Contributor:
It is very important to mention that the caller still needs to do residual filtering, because some filters might not be supported, and some formats might not support filtering at all.
Something like this:


  /**
   * Pushes down the {@link Expression} filter for the reader to prevent reading unnecessary
   * records. Some readers may not support filtering, or may only support filtering for certain expressions.
   * In this case the reader might return unfiltered or partially filtered rows. It is the caller's responsibility to
   * apply the filter again.
   *
   * @param filter the filter to set
   */
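For illustration, a minimal caller-side sketch of that residual filtering, reusing the reader setup from the test in this PR. Evaluator is Iceberg's row-level expression evaluator for StructLike rows; the surrounding variable names are borrowed from the test, not part of the change itself:

  Expression filter = Expressions.lessThan("id", 100);
  Evaluator evaluator = new Evaluator(SIMPLE_SCHEMA.asStruct(), filter);

  try (CloseableIterable<PartitionData> reader =
      InternalData.read(format, inputFile)
          .project(SIMPLE_SCHEMA)
          .setRootType(PartitionData.class)
          .filter(filter) // best-effort pushdown; a format may ignore this
          .build()) {
    for (PartitionData record : reader) {
      // Residual filtering: re-check every returned row, since the reader may
      // have returned unfiltered or partially filtered results.
      if (evaluator.eval(record)) {
        readRecords.add(record);
      }
    }
  }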

default ReadBuilder filter(Expression newFilter) {
Contributor:

nit: maybe call the parameter filter

return this;
}
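Note that the default implementation is a deliberate no-op that ignores the expression, so existing ReadBuilder implementations keep compiling; a format opts into pushdown by overriding filter(Expression), as the builder in the final hunk of this diff does.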

/** Build the configured reader. */
<D> CloseableIterable<D> build();
}
46 changes: 46 additions & 0 deletions core/src/test/java/org/apache/iceberg/TestInternalData.java
@@ -18,13 +18,16 @@
*/
package org.apache.iceberg;

import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES;
import static org.assertj.core.api.Assertions.assertThat;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.io.FileIO;
@@ -155,4 +158,47 @@ public void testCustomTypeForNestedField() throws IOException {
}
}
}

@TestTemplate
public void testFilter() throws IOException {
OutputFile outputFile = fileIO.newOutputFile(tempDir.resolve("test." + format).toString());

int numRecords = 1000;
List<Record> testData = Lists.newArrayListWithExpectedSize(numRecords);
for (int i = 0; i < numRecords; i += 1) {
Record record = GenericRecord.create(SIMPLE_SCHEMA.asStruct());
record.set(0, (long) i);
record.set(1, "some_str");
testData.add(record);
}

int numRowBatch = 8 * 10; // row group size in bytes: 10 longs of 8 bytes each
try (FileAppender<Record> appender =
InternalData.write(format, outputFile)
.set(PARQUET_ROW_GROUP_SIZE_BYTES, Integer.toString(numRowBatch))
.schema(SIMPLE_SCHEMA)
.build()) {
appender.addAll(testData);
}

InputFile inputFile = fileIO.newInputFile(outputFile.location());
List<PartitionData> readRecords = Lists.newArrayList();

try (CloseableIterable<PartitionData> reader =
InternalData.read(format, inputFile)
.project(SIMPLE_SCHEMA)
.setRootType(PartitionData.class)
.filter(Expressions.lessThan("id", 100))
.build()) {
for (PartitionData record : reader) {
readRecords.add(record);
}
}

if (format.equals(FileFormat.PARQUET)) {
assertThat(readRecords).hasSize(100);
} else {
assertThat(readRecords).hasSameSizeAs(testData);
}
}
}
@@ -1279,6 +1279,7 @@ public ReadBuilder filterRecords(boolean newFilterRecords) {
return this;
}

@Override
public ReadBuilder filter(Expression newFilter) {
this.filter = newFilter;
return this;