From e236d11327a1486702c43e11095a85302d1a0c8e Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Wed, 30 Jun 2021 22:41:00 -0700 Subject: [PATCH 01/14] Iceberg source split and split reader --- .../iceberg/flink/FlinkConfigOptions.java | 6 + .../iceberg/flink/source/DataIterator.java | 43 ++- .../flink/source/FlinkSplitGenerator.java | 36 ++- .../apache/iceberg/flink/source/Position.java | 93 ++++++ .../iceberg/flink/source/ScanContext.java | 68 ++--- .../reader/ArrayPoolDataIteratorBatcher.java | 117 ++++++++ .../source/reader/DataIteratorBatcher.java | 33 +++ .../reader/DataIteratorReaderFunction.java | 47 +++ .../flink/source/reader/FileRecords.java | 101 +++++++ .../reader/IcebergSourceReaderMetrics.java | 60 ++++ .../reader/IcebergSourceSplitReader.java | 119 ++++++++ .../flink/source/reader/ReaderFunction.java | 33 +++ .../flink/source/reader/RecordFactory.java | 35 +++ .../reader/RecyclableArrayIterator.java | 85 ++++++ .../reader/RowDataIteratorReaderFunction.java | 58 ++++ .../source/reader/RowDataRecordFactory.java | 58 ++++ .../source/split/IcebergSourceSplit.java | 128 ++++++++ .../split/IcebergSourceSplitSerializer.java | 74 +++++ .../iceberg/flink/HadoopTableResource.java | 82 ++++++ .../source/reader/ReaderFunctionTestBase.java | 277 ++++++++++++++++++ .../reader/TestIcebergSourceSplitReader.java | 208 +++++++++++++ .../reader/TestRecyclableArrayIterator.java | 85 ++++++ .../TestRowDataIteratorReaderFunction.java | 67 +++++ .../flink/source/split/SplitHelpers.java | 96 ++++++ .../TestIcebergSourceSplitSerializer.java | 100 +++++++ 25 files changed, 2060 insertions(+), 49 deletions(-) create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/Position.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java create mode 100644 flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java create mode 100644 flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java create mode 100644 flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java create mode 100644 flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRecyclableArrayIterator.java create mode 100644 
flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataIteratorReaderFunction.java create mode 100644 flink/src/test/java/org/apache/iceberg/flink/source/split/SplitHelpers.java create mode 100644 flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java diff --git a/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java index 067abe8a6e41..d3fb0268fa4f 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java +++ b/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java @@ -40,4 +40,10 @@ private FlinkConfigOptions() { .intType() .defaultValue(100) .withDescription("Sets max infer parallelism for source operator."); + + public static final ConfigOption SOURCE_READER_FETCH_BATCH_SIZE = ConfigOptions + .key("source.iceberg.reader.fetch-batch-size") + .intType() + .defaultValue(2048) + .withDescription("The target batch size for split reader fetch."); } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java index d470b0752304..67b719aba5cf 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java @@ -29,6 +29,7 @@ import org.apache.iceberg.encryption.InputFilesDecryptor; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; /** * Flink data iterator that reads {@link CombinedScanTask} into a {@link CloseableIterator} @@ -37,20 +38,46 @@ */ @Internal public class DataIterator implements CloseableIterator { - private final FileScanTaskReader fileScanTaskReader; - private final InputFilesDecryptor inputFilesDecryptor; + private final CombinedScanTask combinedTask; + private Iterator tasks; private CloseableIterator currentIterator; + private Position position; public DataIterator(FileScanTaskReader fileScanTaskReader, CombinedScanTask task, FileIO io, EncryptionManager encryption) { this.fileScanTaskReader = fileScanTaskReader; - this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); + this.combinedTask = task; + this.tasks = task.files().iterator(); this.currentIterator = CloseableIterator.empty(); + // fileOffset starts at -1 because we started + // from an empty iterator that is not from the split files. 
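+ // advanceFile() bumps the offset to 0 once the iterator for the first file is opened.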
+ this.position = new Position(-1L, 0L); + } + + public void seek(Position startingPosition) { + // skip files + Preconditions.checkArgument(startingPosition.fileOffset() < combinedTask.files().size(), + "Checkpointed file offset is %d, while CombinedScanTask has %d files", + startingPosition.fileOffset(), combinedTask.files().size()); + for (long i = 0L; i < startingPosition.fileOffset(); ++i) { + tasks.next(); + } + updateCurrentIterator(); + // skip records within the file + for (long i = 0; i < startingPosition.recordOffset(); ++i) { + if (hasNext()) { + next(); + } else { + throw new IllegalStateException("Not enough records to skip: " + + startingPosition.recordOffset()); + } + } + this.position.update(startingPosition.fileOffset(), startingPosition.recordOffset()); } @Override @@ -62,9 +89,14 @@ public boolean hasNext() { @Override public T next() { updateCurrentIterator(); + position.advanceRecord(); return currentIterator.next(); } + public boolean isCurrentIteratorDone() { + return !currentIterator.hasNext(); + } + /** * Updates the current iterator field to ensure that the current Iterator * is not exhausted. @@ -74,6 +106,7 @@ private void updateCurrentIterator() { while (!currentIterator.hasNext() && tasks.hasNext()) { currentIterator.close(); currentIterator = openTaskIterator(tasks.next()); + position.advanceFile(); } } catch (IOException e) { throw new UncheckedIOException(e); @@ -90,4 +123,8 @@ public void close() throws IOException { currentIterator.close(); tasks = null; } + + public Position position() { + return position; + } } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java b/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java index f495e0909b7e..1ba396c187e9 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java @@ -27,23 +27,39 @@ import org.apache.iceberg.TableProperties; import org.apache.iceberg.TableScan; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -class FlinkSplitGenerator { +public class FlinkSplitGenerator { private FlinkSplitGenerator() { } static FlinkInputSplit[] createInputSplits(Table table, ScanContext context) { - List tasks = tasks(table, context); - FlinkInputSplit[] splits = new FlinkInputSplit[tasks.size()]; - for (int i = 0; i < tasks.size(); i++) { - splits[i] = new FlinkInputSplit(i, tasks.get(i)); + try (CloseableIterable tasksIterable = planTasks(table, context)) { + List tasks = Lists.newArrayList(tasksIterable); + FlinkInputSplit[] splits = new FlinkInputSplit[tasks.size()]; + for (int i = 0; i < tasks.size(); i++) { + splits[i] = new FlinkInputSplit(i, tasks.get(i)); + } + return splits; + } catch (IOException e) { + throw new UncheckedIOException("Failed to process tasks iterable", e); + } + } + + public static List planIcebergSourceSplits( + Table table, ScanContext context) { + try (CloseableIterable tasksIterable = planTasks(table, context)) { + List splits = Lists.newArrayList(); + tasksIterable.forEach(task -> splits.add(IcebergSourceSplit.fromCombinedScanTask(task))); + return splits; + } catch (IOException e) { + throw new UncheckedIOException("Failed to process task iterable: ", e); } - return splits; } - private static List tasks(Table table, ScanContext context) 
{ + static CloseableIterable planTasks(Table table, ScanContext context) { TableScan scan = table .newScan() .caseSensitive(context.caseSensitive()) @@ -83,10 +99,6 @@ private static List tasks(Table table, ScanContext context) { } } - try (CloseableIterable tasksIterable = scan.planTasks()) { - return Lists.newArrayList(tasksIterable); - } catch (IOException e) { - throw new UncheckedIOException("Failed to close table scan: " + scan, e); - } + return scan.planTasks(); } } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/Position.java b/flink/src/main/java/org/apache/iceberg/flink/source/Position.java new file mode 100644 index 000000000000..9ac2c89e0972 --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/Position.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source; + +import java.io.Serializable; +import java.util.Objects; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; + +/** + * A mutable class that defines the read position + *
    + *
+ * <ul>
+ *   <li>file offset in the list of files in a {@link CombinedScanTask}</li>
+ *   <li>record offset within a file</li>
+ * </ul>
+ */ +public class Position implements Serializable { + + private static final long serialVersionUID = 1L; + + private long fileOffset; + private long recordOffset; + + public Position(long fileOffset, long recordOffset) { + this.fileOffset = fileOffset; + this.recordOffset = recordOffset; + } + + void advanceFile() { + this.fileOffset += 1; + this.recordOffset = 0L; + } + + void advanceRecord() { + this.recordOffset += 1L; + } + + public void update(long newFileOffset, long newRecordOffset) { + this.fileOffset = newFileOffset; + this.recordOffset = newRecordOffset; + } + + public long fileOffset() { + return fileOffset; + } + + public long recordOffset() { + return recordOffset; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final Position that = (Position) o; + return Objects.equals(fileOffset, that.fileOffset) && + Objects.equals(recordOffset, that.recordOffset); + } + + @Override + public int hashCode() { + return Objects.hash(fileOffset, recordOffset); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("fileOffset", fileOffset) + .add("recordOffset", recordOffset) + .toString(); + } +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index 2896efb39655..b0336d70f179 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -34,7 +34,7 @@ /** * Context object with optional arguments for a Flink Scan. */ -class ScanContext implements Serializable { +public class ScanContext implements Serializable { private static final long serialVersionUID = 1L; @@ -105,63 +105,63 @@ private ScanContext(boolean caseSensitive, Long snapshotId, Long startSnapshotId this.limit = limit; } - boolean caseSensitive() { + public boolean caseSensitive() { return caseSensitive; } - Long snapshotId() { + public Long snapshotId() { return snapshotId; } - Long startSnapshotId() { + public Long startSnapshotId() { return startSnapshotId; } - Long endSnapshotId() { + public Long endSnapshotId() { return endSnapshotId; } - Long asOfTimestamp() { + public Long asOfTimestamp() { return asOfTimestamp; } - Long splitSize() { + public Long splitSize() { return splitSize; } - Integer splitLookback() { + public Integer splitLookback() { return splitLookback; } - Long splitOpenFileCost() { + public Long splitOpenFileCost() { return splitOpenFileCost; } - boolean isStreaming() { + public boolean isStreaming() { return isStreaming; } - Duration monitorInterval() { + public Duration monitorInterval() { return monitorInterval; } - String nameMapping() { + public String nameMapping() { return nameMapping; } - Schema project() { + public Schema project() { return schema; } - List filters() { + public List filters() { return filters; } - long limit() { + public long limit() { return limit; } - ScanContext copyWithAppendsBetween(long newStartSnapshotId, long newEndSnapshotId) { + public ScanContext copyWithAppendsBetween(long newStartSnapshotId, long newEndSnapshotId) { return ScanContext.builder() .caseSensitive(caseSensitive) .useSnapshotId(null) @@ -180,7 +180,7 @@ ScanContext copyWithAppendsBetween(long newStartSnapshotId, long newEndSnapshotI .build(); } - ScanContext copyWithSnapshotId(long newSnapshotId) { + public ScanContext copyWithSnapshotId(long newSnapshotId) 
{ return ScanContext.builder() .caseSensitive(caseSensitive) .useSnapshotId(newSnapshotId) @@ -199,11 +199,11 @@ ScanContext copyWithSnapshotId(long newSnapshotId) { .build(); } - static Builder builder() { + public static Builder builder() { return new Builder(); } - static class Builder { + public static class Builder { private boolean caseSensitive = CASE_SENSITIVE.defaultValue(); private Long snapshotId = SNAPSHOT_ID.defaultValue(); private Long startSnapshotId = START_SNAPSHOT_ID.defaultValue(); @@ -222,77 +222,77 @@ static class Builder { private Builder() { } - Builder caseSensitive(boolean newCaseSensitive) { + public Builder caseSensitive(boolean newCaseSensitive) { this.caseSensitive = newCaseSensitive; return this; } - Builder useSnapshotId(Long newSnapshotId) { + public Builder useSnapshotId(Long newSnapshotId) { this.snapshotId = newSnapshotId; return this; } - Builder startSnapshotId(Long newStartSnapshotId) { + public Builder startSnapshotId(Long newStartSnapshotId) { this.startSnapshotId = newStartSnapshotId; return this; } - Builder endSnapshotId(Long newEndSnapshotId) { + public Builder endSnapshotId(Long newEndSnapshotId) { this.endSnapshotId = newEndSnapshotId; return this; } - Builder asOfTimestamp(Long newAsOfTimestamp) { + public Builder asOfTimestamp(Long newAsOfTimestamp) { this.asOfTimestamp = newAsOfTimestamp; return this; } - Builder splitSize(Long newSplitSize) { + public Builder splitSize(Long newSplitSize) { this.splitSize = newSplitSize; return this; } - Builder splitLookback(Integer newSplitLookback) { + public Builder splitLookback(Integer newSplitLookback) { this.splitLookback = newSplitLookback; return this; } - Builder splitOpenFileCost(Long newSplitOpenFileCost) { + public Builder splitOpenFileCost(Long newSplitOpenFileCost) { this.splitOpenFileCost = newSplitOpenFileCost; return this; } - Builder streaming(boolean streaming) { + public Builder streaming(boolean streaming) { this.isStreaming = streaming; return this; } - Builder monitorInterval(Duration newMonitorInterval) { + public Builder monitorInterval(Duration newMonitorInterval) { this.monitorInterval = newMonitorInterval; return this; } - Builder nameMapping(String newNameMapping) { + public Builder nameMapping(String newNameMapping) { this.nameMapping = newNameMapping; return this; } - Builder project(Schema newProjectedSchema) { + public Builder project(Schema newProjectedSchema) { this.projectedSchema = newProjectedSchema; return this; } - Builder filters(List newFilters) { + public Builder filters(List newFilters) { this.filters = newFilters; return this; } - Builder limit(long newLimit) { + public Builder limit(long newLimit) { this.limit = newLimit; return this; } - Builder fromProperties(Map properties) { + public Builder fromProperties(Map properties) { Configuration config = new Configuration(); properties.forEach(config::setString); diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java new file mode 100644 index 000000000000..6b327898a8b1 --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.io.UncheckedIOException; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.SourceReaderOptions; +import org.apache.flink.connector.file.src.util.Pool; +import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.apache.iceberg.flink.FlinkConfigOptions; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.Position; +import org.apache.iceberg.io.CloseableIterator; + +class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { + + private final Configuration config; + private final RecordFactory recordFactory; + + ArrayPoolDataIteratorBatcher(Configuration config, RecordFactory recordFactory) { + this.config = config; + this.recordFactory = recordFactory; + } + + @Override + public CloseableIterator>> apply( + String splitId, DataIterator inputIterator) { + return new ArrayPoolBatchIterator(splitId, inputIterator); + } + + private class ArrayPoolBatchIterator implements CloseableIterator>> { + + private final String splitId; + private final DataIterator inputIterator; + private final int batchSize; + private final Pool pool; + + ArrayPoolBatchIterator(String splitId, DataIterator inputIterator) { + this.splitId = splitId; + this.inputIterator = inputIterator; + this.batchSize = config.getInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_SIZE); + this.pool = createPoolOfBatches(config.getInteger(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY)); + } + + @Override + public boolean hasNext() { + return inputIterator.hasNext(); + } + + @Override + public RecordsWithSplitIds> next() { + final T[] batch = getCachedEntry(); + int num = 0; + while (inputIterator.hasNext() && num < batchSize) { + T nextRecord = inputIterator.next(); + recordFactory.clone(nextRecord, batch[num]); + num++; + if (inputIterator.isCurrentIteratorDone()) { + // break early so that records in the ArrayResultIterator + // have the same fileOffset. 
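+ // A batch therefore never spans two files: RecyclableArrayIterator keeps a single file offset for all of its records.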
+ break; + } + } + if (num == 0) { + return null; + } else { + Position position = inputIterator.position(); + return FileRecords.forRecords(splitId, new RecyclableArrayIterator<>( + pool.recycler(), batch, num, position.fileOffset(), position.recordOffset() - num)); + } + } + + @Override + public void close() throws IOException { + if (inputIterator != null) { + inputIterator.close(); + } + } + + private Pool createPoolOfBatches(int numBatches) { + final Pool poolOfBatches = new Pool<>(numBatches); + for (int batchId = 0; batchId < numBatches; batchId++) { + T[] batch = recordFactory.createBatch(batchSize); + poolOfBatches.add(batch); + } + return poolOfBatches; + } + + private T[] getCachedEntry() { + try { + return pool.pollEntry(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new UncheckedIOException(new IOException("Interrupted")); + } + } + } +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java new file mode 100644 index 000000000000..a296517a1846 --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.io.CloseableIterator; + +@FunctionalInterface +public interface DataIteratorBatcher extends Serializable { + + CloseableIterator>> apply(String splitId, DataIterator inputIterator); + +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java new file mode 100644 index 000000000000..e3e6bfdc0394 --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; + +public abstract class DataIteratorReaderFunction implements ReaderFunction { + + private final DataIteratorBatcher batcher; + + DataIteratorReaderFunction(DataIteratorBatcher batcher) { + this.batcher = batcher; + } + + public abstract DataIterator createDataIterator(IcebergSourceSplit split); + + @Override + public CloseableIterator>> read(IcebergSourceSplit split) { + DataIterator inputIterator = createDataIterator(split); + if (split.position() != null) { + inputIterator.seek(split.position()); + } + return batcher.apply(split.splitId(), inputIterator); + } + +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java new file mode 100644 index 000000000000..256660dd0a7a --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.util.Collections; +import java.util.Set; +import javax.annotation.Nullable; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.apache.iceberg.io.CloseableIterator; + +/** + * A batch of recrods for one split + */ +public class FileRecords implements RecordsWithSplitIds> { + + @Nullable + private final CloseableIterator> recordsForSplit; + private final Set finishedSplits; + + @Nullable + private String splitId; + @Nullable + private CloseableIterator> recordsForSplitCurrent; + + private FileRecords( + @Nullable String splitId, + @Nullable CloseableIterator> recordsForSplit, + Set finishedSplits) { + + this.splitId = splitId; + this.recordsForSplit = recordsForSplit; + this.finishedSplits = finishedSplits; + } + + @Nullable + @Override + public String nextSplit() { + // move the split one (from current value to null) + final String nextSplit = this.splitId; + this.splitId = null; + + // move the iterator, from null to value (if first move) or to null (if second move) + this.recordsForSplitCurrent = nextSplit != null ? this.recordsForSplit : null; + + return nextSplit; + } + + @Nullable + @Override + public RecordAndPosition nextRecordFromSplit() { + if (recordsForSplitCurrent != null) { + return recordsForSplitCurrent.next(); + } else { + throw new IllegalStateException(); + } + } + + @Override + public void recycle() { + if (recordsForSplit != null) { + try { + recordsForSplit.close(); + } catch (IOException e) { + throw new RuntimeException("Failed to close the record batch"); + } + } + } + + @Override + public Set finishedSplits() { + return finishedSplits; + } + + public static FileRecords forRecords( + final String splitId, final CloseableIterator> recordsForSplit) { + return new FileRecords<>(splitId, recordsForSplit, Collections.emptySet()); + } + + public static FileRecords finishedSplit(String splitId) { + return new FileRecords<>(null, null, Collections.singleton(splitId)); + } +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java new file mode 100644 index 000000000000..a2aa7d518731 --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.flink.source.reader; + +import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.metrics.Counter; +import org.apache.flink.metrics.MetricGroup; + +public class IcebergSourceReaderMetrics { + + private final AtomicLong numRecordsOut; + private final AtomicLong assignedSplits; + private final AtomicLong finishedSplits; + private final Counter splitReaderFetches; + + public IcebergSourceReaderMetrics(MetricGroup metricGroup) { + final MetricGroup readerMetricGroup = metricGroup.addGroup("IcebergSourceReader"); + + this.numRecordsOut = new AtomicLong(); + this.assignedSplits = new AtomicLong(); + this.finishedSplits = new AtomicLong(); + readerMetricGroup.gauge("numRecordsOut", numRecordsOut::get); + readerMetricGroup.gauge("assignedSplits", assignedSplits::get); + readerMetricGroup.gauge("finishedSplits", finishedSplits::get); + this.splitReaderFetches = readerMetricGroup.counter("splitReaderFetches"); + } + + public void incrementNumRecordsOut(long delta) { + numRecordsOut.addAndGet(delta); + } + + public void incrementAssignedSplits(long delta) { + assignedSplits.addAndGet(delta); + } + + public void incrementFinishedSplits(long delta) { + finishedSplits.addAndGet(delta); + } + + public void recordSplitReaderFetches() { + splitReaderFetches.inc(); + } +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java new file mode 100644 index 000000000000..9c68b6f6b4ac --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayDeque; +import java.util.Queue; +import javax.annotation.Nullable; +import org.apache.flink.api.connector.source.SourceReaderContext; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; +import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class IcebergSourceSplitReader implements SplitReader, IcebergSourceSplit> { + private static final Logger LOG = LoggerFactory.getLogger(IcebergSourceSplitReader.class); + + private final ReaderFunction readerFunction; + private final int indexOfSubtask; + private final IcebergSourceReaderMetrics metrics; + + private final Queue splits; + + @Nullable + private CloseableIterator>> currentReader; + @Nullable + private String currentSplitId; + + IcebergSourceSplitReader(ReaderFunction readerFunction, + SourceReaderContext context, + IcebergSourceReaderMetrics metrics) { + this.readerFunction = readerFunction; + this.indexOfSubtask = context.getIndexOfSubtask(); + this.metrics = metrics; + this.splits = new ArrayDeque<>(); + } + + @Override + public RecordsWithSplitIds> fetch() throws IOException { + metrics.recordSplitReaderFetches(); + checkSplitOrStartNext(); + if (currentReader.hasNext()) { + // Because Iterator#next() doesn't support checked exception, + // we need to wrap and unwrap the checked IOException with UncheckedIOException + try { + return currentReader.next(); + } catch (UncheckedIOException e) { + throw e.getCause(); + } + } else { + return finishSplit(); + } + } + + @Override + public void handleSplitsChanges(SplitsChange splitsChanges) { + LOG.debug("Add splits to reader: {}", splitsChanges.splits()); + splits.addAll(splitsChanges.splits()); + metrics.incrementAssignedSplits(splitsChanges.splits().size()); + } + + @Override + public void wakeUp() { + } + + @Override + public void close() throws Exception { + currentSplitId = null; + if (currentReader != null) { + currentReader.close(); + } + } + + private void checkSplitOrStartNext() throws IOException { + if (currentReader != null) { + return; + } + final IcebergSourceSplit nextSplit = splits.poll(); + if (nextSplit == null) { + throw new IOException("No split remaining"); + } + currentSplitId = nextSplit.splitId(); + currentReader = readerFunction.read(nextSplit); + } + + private FileRecords finishSplit() throws IOException { + if (currentReader != null) { + currentReader.close(); + currentReader = null; + } + final FileRecords finishRecords = FileRecords.finishedSplit(currentSplitId); + LOG.debug("Split reader {} finished split: {}", indexOfSubtask, currentSplitId); + currentSplitId = null; + metrics.incrementFinishedSplits(1L); + return finishRecords; + } +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java new file mode 100644 index 000000000000..bd6fd097444b --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; + +public interface ReaderFunction extends Serializable { + + CloseableIterator>> read(IcebergSourceSplit split); + +} + diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java new file mode 100644 index 000000000000..82deda46018a --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import java.io.Serializable; + +interface RecordFactory extends Serializable { + + /** + * Create a batch of records + */ + T[] createBatch(int batchSize); + + /** + * Clone record + */ + void clone(T from, T to); +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java new file mode 100644 index 000000000000..63e657b12629 --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import javax.annotation.Nullable; +import org.apache.flink.connector.file.src.util.ArrayResultIterator; +import org.apache.flink.connector.file.src.util.CheckpointedPosition; +import org.apache.flink.connector.file.src.util.MutableRecordAndPosition; +import org.apache.flink.connector.file.src.util.Pool; +import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.apache.iceberg.io.CloseableIterator; + +/** + * Similar to the {@link ArrayResultIterator}. + * Main difference is the records array can be recycled back to a pool. + */ +final class RecyclableArrayIterator implements CloseableIterator> { + + private final Pool.Recycler recycler; + private final E[] records; + private final int num; + private final MutableRecordAndPosition recordAndPosition; + + private int pos; + + RecyclableArrayIterator(Pool.Recycler recycler) { + this(recycler, null, 0, CheckpointedPosition.NO_OFFSET, 0L); + } + + /** + * Each record's {@link RecordAndPosition} will have the same fileOffset (for {@link RecordAndPosition#getOffset()}. + * The first returned record will have a records-to-skip count of {@code recordOffset + 1}, following + * the contract that each record needs to point to the position AFTER itself + * (because a checkpoint taken after the record was emitted needs to resume from after that record). + */ + RecyclableArrayIterator( + Pool.Recycler recycler, final E[] newRecords, + final int newNum, final long fileOffset, final long recordOffset) { + this.recycler = recycler; + this.records = newRecords; + this.num = newNum; + this.recordAndPosition = new MutableRecordAndPosition<>(); + this.recordAndPosition.set(null, fileOffset, recordOffset); + + this.pos = 0; + } + + @Override + public boolean hasNext() { + return pos < num; + } + + @Override + @Nullable + public RecordAndPosition next() { + if (pos < num) { + recordAndPosition.setNext(records[pos++]); + return recordAndPosition; + } else { + return null; + } + } + + @Override + public void close() { + recycler.recycle(records); + } +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java new file mode 100644 index 000000000000..92e054c0e13b --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Table; +import org.apache.iceberg.flink.source.DataIterator; +import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; + +public class RowDataIteratorReaderFunction extends DataIteratorReaderFunction { + + private final Table table; + private final ScanContext scanContext; + + public RowDataIteratorReaderFunction( + Configuration config, + Table table, + ScanContext scanContext, + RowType rowType) { + super(new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory(rowType))); + this.table = table; + this.scanContext = scanContext; + } + + @Override + public DataIterator createDataIterator(IcebergSourceSplit split) { + return new DataIterator<>( + new RowDataFileScanTaskReader( + table.schema(), + scanContext.project(), + scanContext.nameMapping(), + scanContext.caseSensitive()), + split.task(), + table.io(), + table.encryption()); + } +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java new file mode 100644 index 000000000000..f2ff28cab222 --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.api.common.typeutils.TypeSerializer; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalSerializers; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.flink.data.RowDataUtil; + +class RowDataRecordFactory implements RecordFactory { + + private final RowType rowType; + private final TypeSerializer[] fieldSerializers; + + RowDataRecordFactory(final RowType rowType) { + this.rowType = rowType; + this.fieldSerializers = createFieldSerializers(rowType); + } + + static TypeSerializer[] createFieldSerializers(RowType rowType) { + return rowType.getChildren().stream() + .map(InternalSerializers::create) + .toArray(TypeSerializer[]::new); + } + + @Override + public RowData[] createBatch(int batchSize) { + RowData[] arr = new RowData[batchSize]; + for (int i = 0; i < batchSize; ++i) { + arr[i] = new GenericRowData(rowType.getFieldCount()); + } + return arr; + } + + @Override + public void clone(RowData from, RowData to) { + RowDataUtil.clone(from, to, rowType, fieldSerializers); + } +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java new file mode 100644 index 000000000000..02692dd86668 --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.split; + +import java.io.Serializable; +import java.util.Collection; +import java.util.stream.Collectors; +import javax.annotation.Nullable; +import org.apache.flink.api.connector.source.SourceSplit; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.flink.source.Position; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Objects; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; + +public class IcebergSourceSplit implements SourceSplit, Serializable { + + private final CombinedScanTask task; + + /** + * Position field is mutable + */ + @Nullable + private final Position position; + + /** + * The splits are frequently serialized into checkpoints. + * Caching the byte representation makes repeated serialization cheap. 
+ */ + @Nullable + private transient byte[] serializedFormCache; + + public IcebergSourceSplit(CombinedScanTask task, Position position) { + this.task = task; + this.position = position; + } + + public static IcebergSourceSplit fromCombinedScanTask(CombinedScanTask combinedScanTask) { + return fromCombinedScanTask(combinedScanTask, 0L, 0L); + } + + public static IcebergSourceSplit fromCombinedScanTask( + CombinedScanTask combinedScanTask, long fileOffset, long recordOffset) { + return new IcebergSourceSplit(combinedScanTask, new Position(fileOffset, recordOffset)); + } + + public CombinedScanTask task() { + return task; + } + + public Position position() { + return position; + } + + public byte[] serializedFormCache() { + return serializedFormCache; + } + + public void serializedFormCache(byte[] cachedBytes) { + this.serializedFormCache = cachedBytes; + } + + @Override + public String splitId() { + return MoreObjects.toStringHelper(this) + .add("files", toString(task.files())) + .toString(); + } + + public void updatePosition(long newFileOffset, long newRecordOffset) { + position.update(newFileOffset, newRecordOffset); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + IcebergSourceSplit split = (IcebergSourceSplit) o; + return Objects.equal(splitId(), split.splitId()) && + Objects.equal(position, split.position()); + } + + @Override + public int hashCode() { + return Objects.hashCode(splitId()); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("files", toString(task.files())) + .add("position", position) + .toString(); + } + + private String toString(Collection files) { + return Iterables.toString(files.stream().map(fileScanTask -> + MoreObjects.toStringHelper(fileScanTask) + .add("file", fileScanTask.file() != null ? + fileScanTask.file().path().toString() : + "NoFile") + .add("start", fileScanTask.start()) + .add("length", fileScanTask.length()) + .toString()).collect(Collectors.toList())); + } +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java new file mode 100644 index 000000000000..dcac972bd06f --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.flink.source.split; + +import java.io.IOException; +import org.apache.flink.annotation.VisibleForTesting; +import org.apache.flink.core.io.SimpleVersionedSerializer; +import org.apache.flink.util.InstantiationUtil; + +/** + * TODO: use Java serialization for now. + * will switch to more stable serializer from issue-1698. + */ +public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer { + + public static final IcebergSourceSplitSerializer INSTANCE = new IcebergSourceSplitSerializer(); + + private static final int VERSION = 1; + + @Override + public int getVersion() { + return VERSION; + } + + @Override + public byte[] serialize(IcebergSourceSplit split) throws IOException { + if (split.serializedFormCache() == null) { + final byte[] result = serializeV1(split); + split.serializedFormCache(result); + } + return split.serializedFormCache(); + } + + @Override + public IcebergSourceSplit deserialize(int version, byte[] serialized) throws IOException { + switch (version) { + case 1: + return deserializeV1(serialized); + default: + throw new IOException("Unknown version: " + version); + } + } + + @VisibleForTesting + byte[] serializeV1(IcebergSourceSplit split) throws IOException { + return InstantiationUtil.serializeObject(split); + } + + @VisibleForTesting + IcebergSourceSplit deserializeV1(byte[] serialized) throws IOException { + try { + return InstantiationUtil.deserializeObject(serialized, getClass().getClassLoader()); + } catch (ClassNotFoundException e) { + throw new RuntimeException("Failed to deserialize the split.", e); + } + } +} diff --git a/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java b/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java new file mode 100644 index 000000000000..a205b22f3ed5 --- /dev/null +++ b/flink/src/test/java/org/apache/iceberg/flink/HadoopTableResource.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.flink; + +import java.io.File; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.junit.Assert; +import org.junit.rules.ExternalResource; +import org.junit.rules.TemporaryFolder; + +public class HadoopTableResource extends ExternalResource { + + private final TemporaryFolder temporaryFolder; + private final String database; + private final String tableName; + private final Schema schema; + + private HadoopCatalog catalog; + private TableLoader tableLoader; + private Table table; + + public HadoopTableResource(TemporaryFolder temporaryFolder, String database, String tableName, Schema schema) { + this.temporaryFolder = temporaryFolder; + this.database = database; + this.tableName = tableName; + this.schema = schema; + } + + @Override + protected void before() throws Throwable { + File warehouseFile = temporaryFolder.newFolder(); + Assert.assertTrue(warehouseFile.delete()); + // before variables + String warehouse = "file:" + warehouseFile; + Configuration hadoopConf = new Configuration(); + this.catalog = new HadoopCatalog(hadoopConf, warehouse); + String location = String.format("%s/%s/%s", warehouse, database, tableName); + this.tableLoader = TableLoader.fromHadoopTable(location); + this.table = catalog.createTable(TableIdentifier.of(database, tableName), schema); + tableLoader.open(); + } + + @Override + protected void after() { + try { + catalog.dropTable(TableIdentifier.of(database, tableName)); + catalog.close(); + tableLoader.close(); + } catch (Exception e) { + throw new RuntimeException("Failed to close catalog resource"); + } + } + + public TableLoader tableLoader() { + return tableLoader; + } + + public Table table() { + return table; + } +} diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java new file mode 100644 index 000000000000..54abab75341d --- /dev/null +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java @@ -0,0 +1,277 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.HadoopTableResource; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.FlinkSplitGenerator; +import org.apache.iceberg.flink.source.Position; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.hamcrest.CoreMatchers; +import org.junit.Assert; +import org.junit.Before; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +@RunWith(Parameterized.class) +public abstract class ReaderFunctionTestBase { + + @Parameterized.Parameters(name = "fileFormat={0}") + public static Object[][] parameters() { + return new Object[][]{ + new Object[]{FileFormat.AVRO}, + new Object[]{FileFormat.ORC}, + new Object[]{FileFormat.PARQUET} + }; + } + + @ClassRule + public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + + protected static final ScanContext scanContext = ScanContext.builder() + .project(TestFixtures.SCHEMA) + .build(); + + @Rule + public final HadoopTableResource tableResource = new HadoopTableResource(TEMPORARY_FOLDER, + TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + + protected abstract ReaderFunction readerFunction(); + + protected abstract void assertRecords(List expected, List actual, Schema schema); + + private final FileFormat fileFormat; + + public ReaderFunctionTestBase(FileFormat fileFormat) { + this.fileFormat = fileFormat; + } + + private List> recordBatchList; + private List dataFileList; + private IcebergSourceSplit icebergSplit; + + @Before + public void before() throws IOException { + final GenericAppenderHelper dataAppender = new GenericAppenderHelper( + tableResource.table(), fileFormat, TEMPORARY_FOLDER); + recordBatchList = new ArrayList<>(3); + dataFileList = new ArrayList<>(2); + for (int i = 0; i < 3; ++i) { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); + recordBatchList.add(records); + DataFile dataFile = dataAppender.writeFile(null, records); + dataFileList.add(dataFile); + dataAppender.appendToTable(dataFile); + } + + final List splits = FlinkSplitGenerator + .planIcebergSourceSplits(tableResource.table(), scanContext); + Assert.assertEquals(1, splits.size()); + Assert.assertEquals(3, splits.get(0).task().files().size()); + icebergSplit = sortFilesAsAppendOrder(splits.get(0), dataFileList); + } + + /** + * Split planning doesn't guarantee the order is the same as appended. 
+ * So we re-arrange the list to make the assertion simpler + */ + public static IcebergSourceSplit sortFilesAsAppendOrder(IcebergSourceSplit split, List dataFiles) { + Collection files = split.task().files(); + Assert.assertEquals(files.size(), dataFiles.size()); + FileScanTask[] sortedFileArray = new FileScanTask[files.size()]; + for (FileScanTask fileScanTask : files) { + for (int i = 0; i < dataFiles.size(); ++i) { + if (fileScanTask.file().path().toString().equals(dataFiles.get(i).path().toString())) { + sortedFileArray[i] = fileScanTask; + } + } + } + List sortedFileList = Lists.newArrayList(sortedFileArray); + Assert.assertThat(sortedFileList, CoreMatchers.everyItem(CoreMatchers.notNullValue(FileScanTask.class))); + CombinedScanTask rearrangedCombinedTask = new BaseCombinedScanTask(sortedFileList); + return IcebergSourceSplit.fromCombinedScanTask(rearrangedCombinedTask); + } + + /** + * We have to combine the record extraction and position assertion in a single function, + * because iterator is only valid for one pass. + */ + private List extractRecordsAndAssertPosition( + RecordsWithSplitIds> batch, + long expectedCount, long exptectedFileOffset, long startRecordOffset) { + // need to call nextSplit first in order to read the batch + batch.nextSplit(); + final List records = new ArrayList<>(); + long recordOffset = startRecordOffset; + RecordAndPosition recordAndPosition; + while ((recordAndPosition = batch.nextRecordFromSplit()) != null) { + records.add(recordAndPosition.getRecord()); + Assert.assertEquals("expected file offset", exptectedFileOffset, recordAndPosition.getOffset()); + Assert.assertEquals("expected record offset", recordOffset, recordAndPosition.getRecordSkipCount() - 1); + recordOffset++; + } + Assert.assertEquals("expected record count", expectedCount, records.size()); + return records; + } + + @Test + public void testNoCheckpointedPosition() throws IOException { + final IcebergSourceSplit split = icebergSplit; + final CloseableIterator>> reader = readerFunction().apply(split); + + final RecordsWithSplitIds> batch0 = reader.next(); + final List actual0 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(0).size(), 0L, 0L); + assertRecords(recordBatchList.get(0), actual0, TestFixtures.SCHEMA); + batch0.recycle(); + + final RecordsWithSplitIds> batch1 = reader.next(); + final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1L, 0L); + assertRecords(recordBatchList.get(1), actual1, TestFixtures.SCHEMA); + batch1.recycle(); + + final RecordsWithSplitIds> batch2 = reader.next(); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); + batch2.recycle(); + } + + @Test + public void testCheckpointedPositionBeforeFirstFile() throws IOException { + final IcebergSourceSplit split = new IcebergSourceSplit( + icebergSplit.task(), + new Position(0L, 0L)); + final CloseableIterator>> reader = readerFunction().apply(split); + + final RecordsWithSplitIds> batch0 = reader.next(); + final List actual0 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(0).size(), 0L, 0L); + assertRecords(recordBatchList.get(0), actual0, TestFixtures.SCHEMA); + batch0.recycle(); + + final RecordsWithSplitIds> batch1 = reader.next(); + final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1L, 0L); + assertRecords(recordBatchList.get(1), actual1, TestFixtures.SCHEMA); + batch1.recycle(); + + 
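+ // a checkpoint at Position(0, 0) points before the first file, so reading proceeds the same as with no checkpoint; the last batch below is still expected to come from the third file (file offset 2)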
final RecordsWithSplitIds> batch2 = reader.next(); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); + batch2.recycle(); + } + + @Test + public void testCheckpointedPositionMiddleFirstFile() throws IOException { + final IcebergSourceSplit split = new IcebergSourceSplit( + icebergSplit.task(), + new Position(0L, 1L)); + final CloseableIterator>> reader = readerFunction().apply(split); + + final RecordsWithSplitIds> batch0 = reader.next(); + final List actual0 = extractRecordsAndAssertPosition(batch0, 1L, 0L, 1L); + assertRecords(recordBatchList.get(0).subList(1, 2), actual0, TestFixtures.SCHEMA); + batch0.recycle(); + + final RecordsWithSplitIds> batch1 = reader.next(); + final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1L, 0L); + assertRecords(recordBatchList.get(1), actual1, TestFixtures.SCHEMA); + batch1.recycle(); + + final RecordsWithSplitIds> batch2 = reader.next(); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); + batch2.recycle(); + } + + @Test + public void testCheckpointedPositionAfterFirstFile() throws IOException { + final IcebergSourceSplit split = new IcebergSourceSplit( + icebergSplit.task(), + new Position(0L, 2L)); + final CloseableIterator>> reader = readerFunction().apply(split); + + final RecordsWithSplitIds> batch0 = reader.next(); + final List actual1 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(1).size(), 1L, 0L); + assertRecords(recordBatchList.get(1), actual1, TestFixtures.SCHEMA); + batch0.recycle(); + + final RecordsWithSplitIds> batch2 = reader.next(); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); + batch2.recycle(); + } + + @Test + public void testCheckpointedPositionBeforeSecondFile() throws IOException { + final IcebergSourceSplit split = new IcebergSourceSplit( + icebergSplit.task(), + new Position(1L, 0L)); + final CloseableIterator>> reader = readerFunction().apply(split); + + final RecordsWithSplitIds> batch1 = reader.next(); + final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1L, 0L); + assertRecords(recordBatchList.get(1), actual1, TestFixtures.SCHEMA); + batch1.recycle(); + + final RecordsWithSplitIds> batch2 = reader.next(); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); + batch2.recycle(); + } + + @Test + public void testCheckpointedPositionMidSecondFile() throws IOException { + final IcebergSourceSplit split = new IcebergSourceSplit( + icebergSplit.task(), + new Position(1L, 1L)); + final CloseableIterator>> reader = readerFunction().apply(split); + + final RecordsWithSplitIds> batch1 = reader.next(); + final List actual1 = extractRecordsAndAssertPosition(batch1, 1L, 1L, 1L); + assertRecords(recordBatchList.get(1).subList(1, 2), actual1, TestFixtures.SCHEMA); + batch1.recycle(); + + final RecordsWithSplitIds> batch2 = reader.next(); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); + batch2.recycle(); + } + +} diff --git 
a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java new file mode 100644 index 000000000000..e83d1d61dd35 --- /dev/null +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; +import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.types.Row; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.HadoopTableResource; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.flink.source.FlinkSplitGenerator; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.RuleChain; +import org.junit.rules.TemporaryFolder; +import org.junit.rules.TestRule; + +public class TestIcebergSourceSplitReader { + + public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + public static final HadoopTableResource tableResource = new HadoopTableResource(TEMPORARY_FOLDER, + TestFixtures.DATABASE, TestFixtures.TABLE, TestFixtures.SCHEMA); + + @ClassRule + public static final TestRule chain = RuleChain + .outerRule(TEMPORARY_FOLDER) + .around(tableResource); + + private static final ScanContext scanContext = ScanContext.builder() + .project(TestFixtures.SCHEMA) + .build(); + private static final FileFormat fileFormat = FileFormat.PARQUET; + + private static List> recordBatchList; + private static List dataFileList; + private static IcebergSourceSplit icebergSplit; + + @BeforeClass + public static void beforeClass() throws IOException { + final GenericAppenderHelper dataAppender = new GenericAppenderHelper( + 
tableResource.table(), fileFormat, TEMPORARY_FOLDER); + recordBatchList = new ArrayList<>(3); + dataFileList = new ArrayList<>(2); + for (int i = 0; i < 3; ++i) { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); + recordBatchList.add(records); + DataFile dataFile = dataAppender.writeFile(null, records); + dataFileList.add(dataFile); + dataAppender.appendToTable(dataFile); + } + + final List splits = FlinkSplitGenerator + .planIcebergSourceSplits(tableResource.table(), scanContext); + Assert.assertEquals(1, splits.size()); + Assert.assertEquals(3, splits.get(0).task().files().size()); + icebergSplit = ReaderFunctionTestBase.sortFilesAsAppendOrder(splits.get(0), dataFileList); + } + + @Test + public void testFullScan() throws Exception { + final IcebergSourceSplit split = icebergSplit; + final Configuration config = new Configuration(); + RowType rowType = FlinkSchemaUtil.convert(tableResource.table().schema()); + IcebergSourceSplitReader reader = new IcebergSourceSplitReader( + new RowDataIteratorReaderFunction(config, tableResource.table(), scanContext, rowType)); + reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); + + final RecordsWithSplitIds> readBatch0 = reader.fetch(); + final List rowBatch0 = readRows(readBatch0, split.splitId(), 0L, 0L); + TestHelpers.assertRecords(rowBatch0, recordBatchList.get(0), TestFixtures.SCHEMA); + + final RecordsWithSplitIds> readBatch1 + = reader.fetch(); + final List rowBatch1 = readRows(readBatch1, split.splitId(), 1L, 0L); + TestHelpers.assertRecords(rowBatch1, recordBatchList.get(1), TestFixtures.SCHEMA); + + final RecordsWithSplitIds> readBatch2 = reader.fetch(); + final List rowBatch2 = readRows(readBatch2, split.splitId(), 2L, 0L); + TestHelpers.assertRecords(rowBatch2, recordBatchList.get(2), TestFixtures.SCHEMA); + + final RecordsWithSplitIds> finishedBatch = reader.fetch(); + Assert.assertEquals(Sets.newHashSet(split.splitId()), finishedBatch.finishedSplits()); + Assert.assertEquals(null, finishedBatch.nextSplit()); + } + + @Test + public void testResumeFromEndOfFirstBatch() throws Exception { + final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 0L, 2L); + final Configuration config = new Configuration(); + RowType rowType = FlinkSchemaUtil.convert(tableResource.table().schema()); + IcebergSourceSplitReader reader = new IcebergSourceSplitReader( + new RowDataIteratorReaderFunction(config, tableResource.table(), scanContext, rowType)); + reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); + + final RecordsWithSplitIds> readBatch1 = reader.fetch(); + final List rowBatch1 = readRows(readBatch1, split.splitId(), 1L, 0L); + TestHelpers.assertRecords(rowBatch1, recordBatchList.get(1), TestFixtures.SCHEMA); + + final RecordsWithSplitIds> readBatch2 = reader.fetch(); + final List rowBatch2 = readRows(readBatch2, split.splitId(), 2L, 0L); + TestHelpers.assertRecords(rowBatch2, recordBatchList.get(2), TestFixtures.SCHEMA); + + final RecordsWithSplitIds> finishedBatch = reader.fetch(); + Assert.assertEquals(Sets.newHashSet(split.splitId()), finishedBatch.finishedSplits()); + Assert.assertEquals(null, finishedBatch.nextSplit()); + } + + @Test + public void testResumeFromStartOfSecondBatch() throws Exception { + final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 1L, 0L); + final Configuration config = new Configuration(); + RowType rowType = FlinkSchemaUtil.convert(tableResource.table().schema()); + 
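+ // the split resumes from file offset 1 (start of the second file), so the first fetch below is expected to skip the first file entirely and return the second file's records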
IcebergSourceSplitReader reader = new IcebergSourceSplitReader( + new RowDataIteratorReaderFunction(config, tableResource.table(), scanContext, rowType)); + reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); + + final RecordsWithSplitIds> readBatch1 = reader.fetch(); + final List rowBatch1 = readRows(readBatch1, split.splitId(), 1L, 0L); + TestHelpers.assertRecords(rowBatch1, recordBatchList.get(1), TestFixtures.SCHEMA); + + final RecordsWithSplitIds> readBatch2 = reader.fetch(); + final List rowBatch2 = readRows(readBatch2, split.splitId(), 2L, 0L); + TestHelpers.assertRecords(rowBatch2, recordBatchList.get(2), TestFixtures.SCHEMA); + + final RecordsWithSplitIds> finishedBatch + = reader.fetch(); + Assert.assertEquals(Sets.newHashSet(split.splitId()), finishedBatch.finishedSplits()); + Assert.assertEquals(null, finishedBatch.nextSplit()); + } + + @Test + public void testResumeFromMiddleOfSecondBatch() throws Exception { + final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 1L, 1L); + + final Configuration config = new Configuration(); + RowType rowType = FlinkSchemaUtil.convert(tableResource.table().schema()); + IcebergSourceSplitReader reader = new IcebergSourceSplitReader( + new RowDataIteratorReaderFunction(config, tableResource.table(), scanContext, rowType)); + reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); + + final RecordsWithSplitIds> readBatch1 = reader.fetch(); + final List rowBatch1 = readRows(readBatch1, split.splitId(), 1L, 1L); + TestHelpers.assertRecords(rowBatch1, recordBatchList.get(1).subList(1, 2), TestFixtures.SCHEMA); + + final RecordsWithSplitIds> readBatch2 = reader.fetch(); + final List rowBatch2 = readRows(readBatch2, split.splitId(), 2L, 0L); + TestHelpers.assertRecords(rowBatch2, recordBatchList.get(2), TestFixtures.SCHEMA); + + final RecordsWithSplitIds> finishedBatch + = reader.fetch(); + Assert.assertEquals(Sets.newHashSet(split.splitId()), finishedBatch.finishedSplits()); + Assert.assertEquals(null, finishedBatch.nextSplit()); + } + + private List readRows( + RecordsWithSplitIds> readBatch, + String expectedSplitId, long expectedOffset, long expectedStartingRecordOffset) { + Assert.assertEquals(expectedSplitId, readBatch.nextSplit()); + final List rowDataList = new ArrayList<>(); + RecordAndPosition row; + int num = 0; + while ((row = readBatch.nextRecordFromSplit()) != null) { + Assert.assertEquals(expectedOffset, row.getOffset()); + num++; + Assert.assertEquals(expectedStartingRecordOffset + num, row.getRecordSkipCount()); + rowDataList.add(row.getRecord()); + } + readBatch.recycle(); + return TestHelpers.convertRowDataToRow(rowDataList, TestFixtures.ROW_TYPE); + } + +} diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRecyclableArrayIterator.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRecyclableArrayIterator.java new file mode 100644 index 000000000000..eb3503ed5371 --- /dev/null +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRecyclableArrayIterator.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import java.util.concurrent.atomic.AtomicBoolean; +import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.junit.Assert; +import org.junit.Test; + +public class TestRecyclableArrayIterator { + + @Test + public void testEmptyConstruction() { + // dummy recycler + final RecyclableArrayIterator iter = new RecyclableArrayIterator<>( + ignored -> System.currentTimeMillis()); + Assert.assertNull(iter.next()); + } + + @Test + public void testGetElements() { + final String[] elements = new String[]{"1", "2", "3", "4"}; + final long initialOffset = 3; + final long initialSkipCount = 17; + + // dummy recycler + final RecyclableArrayIterator iter = new RecyclableArrayIterator<>( + ignored -> System.currentTimeMillis(), elements, elements.length, initialOffset, initialSkipCount); + + for (int i = 0; i < elements.length; i++) { + final RecordAndPosition recAndPos = iter.next(); + Assert.assertEquals(elements[i], recAndPos.getRecord()); + Assert.assertEquals(initialOffset, recAndPos.getOffset()); + Assert.assertEquals(initialSkipCount + i + 1, recAndPos.getRecordSkipCount()); + } + } + + @Test + public void testExhausted() { + // dummy recycler + final RecyclableArrayIterator iter = new RecyclableArrayIterator<>( + ignored -> System.currentTimeMillis(), new String[]{"1", "2"}, 2, 0L, 0L); + + iter.next(); + iter.next(); + + Assert.assertNull(iter.next()); + } + + @Test + public void testArraySubRange() { + // dummy recycler + final RecyclableArrayIterator iter = new RecyclableArrayIterator<>(ignored -> System.currentTimeMillis(), + new String[]{"1", "2", "3"}, 2, 0L, 0L); + + Assert.assertNotNull(iter.next()); + Assert.assertNotNull(iter.next()); + Assert.assertNull(iter.next()); + } + + @Test + public void testRecycler() { + final AtomicBoolean recycled = new AtomicBoolean(); + final RecyclableArrayIterator iter = new RecyclableArrayIterator<>(ignored -> recycled.set(true)); + iter.close(); + Assert.assertTrue(recycled.get()); + } +} diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataIteratorReaderFunction.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataIteratorReaderFunction.java new file mode 100644 index 000000000000..247b788403be --- /dev/null +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataIteratorReaderFunction.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import java.util.List; +import java.util.stream.Collectors; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.conversion.DataStructureConverter; +import org.apache.flink.table.data.conversion.DataStructureConverters; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.TestHelpers; + +public class TestRowDataIteratorReaderFunction extends ReaderFunctionTestBase { + + protected static final RowType rowType = FlinkSchemaUtil + .convert(scanContext.project()); + private static final DataStructureConverter rowDataConverter = DataStructureConverters.getConverter( + TypeConversions.fromLogicalToDataType(rowType)); + private static final org.apache.flink.configuration.Configuration flinkConfig = + new org.apache.flink.configuration.Configuration(); + + public TestRowDataIteratorReaderFunction(FileFormat fileFormat) { + super(fileFormat); + } + + @Override + protected ReaderFunction readerFunction() { + return new RowDataIteratorReaderFunction(new Configuration(), tableResource.table(), scanContext, rowType); + } + + @Override + protected void assertRecords(List expected, List actual, Schema schema) { + final List rows = toRows(actual); + TestHelpers.assertRecords(rows, expected, TestFixtures.SCHEMA); + } + + private List toRows(List actual) { + return actual.stream() + .map(rowData -> (Row) rowDataConverter.toExternal(rowData)) + .collect(Collectors.toList()); + } +} diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/split/SplitHelpers.java b/flink/src/test/java/org/apache/iceberg/flink/source/split/SplitHelpers.java new file mode 100644 index 000000000000..91cf7c0526c6 --- /dev/null +++ b/flink/src/test/java/org/apache/iceberg/flink/source/split/SplitHelpers.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.iceberg.flink.source.split; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import org.apache.iceberg.BaseCombinedScanTask; +import org.apache.iceberg.CombinedScanTask; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MockFileScanTask; +import org.apache.iceberg.Table; +import org.apache.iceberg.data.GenericAppenderHelper; +import org.apache.iceberg.data.RandomGenericData; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.FlinkSplitGenerator; +import org.apache.iceberg.flink.source.ScanContext; +import org.apache.iceberg.hadoop.HadoopCatalog; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.Assert; +import org.junit.rules.TemporaryFolder; + +public class SplitHelpers { + + private static final AtomicLong splitLengthIncrement = new AtomicLong(); + + private SplitHelpers() { + + } + + public static List createMockedSplits(int splitCount) { + final List splits = new ArrayList<>(); + for (int i = 0; i < splitCount; ++i) { + // make sure each task has a different length, + // as it is part of the splitId calculation. + // This way, we can make sure all generated splits have different splitIds + FileScanTask fileScanTask = new MockFileScanTask(1024 + splitLengthIncrement.incrementAndGet()); + CombinedScanTask combinedScanTask = new BaseCombinedScanTask(fileScanTask); + splits.add(IcebergSourceSplit.fromCombinedScanTask(combinedScanTask)); + } + return splits; + } + + public static List createFileSplits( + TemporaryFolder temporaryFolder, int fileCount, int filesPerSplit) throws Exception { + final File warehouseFile = temporaryFolder.newFolder(); + Assert.assertTrue(warehouseFile.delete()); + final String warehouse = "file:" + warehouseFile; + org.apache.hadoop.conf.Configuration hadoopConf = new org.apache.hadoop.conf.Configuration(); + final HadoopCatalog catalog = new HadoopCatalog(hadoopConf, warehouse); + try { + final Table table = catalog.createTable(TestFixtures.TABLE_IDENTIFIER, TestFixtures.SCHEMA); + final GenericAppenderHelper dataAppender = new GenericAppenderHelper( + table, FileFormat.PARQUET, temporaryFolder); + for (int i = 0; i < fileCount; ++i) { + List records = RandomGenericData.generate(TestFixtures.SCHEMA, 2, i); + dataAppender.appendToTable(records); + } + + final ScanContext scanContext = ScanContext.builder().build(); + final List splits = FlinkSplitGenerator.planIcebergSourceSplits(table, scanContext); + return splits.stream() + .flatMap(split -> { + List> filesList = Lists.partition(new ArrayList<>(split.task().files()), filesPerSplit); + return filesList.stream() + .map(files -> new BaseCombinedScanTask(files)) + .map(combinedScanTask -> IcebergSourceSplit.fromCombinedScanTask(combinedScanTask)); + }) + .collect(Collectors.toList()); + } finally { + catalog.dropTable(TestFixtures.TABLE_IDENTIFIER); + catalog.close(); + } + } +} diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java b/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java new file mode 100644 index 000000000000..38b358ec55a7 --- /dev/null +++ b/flink/src/test/java/org/apache/iceberg/flink/source/split/TestIcebergSourceSplitSerializer.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.split; + +import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; +import org.apache.iceberg.flink.source.Position; +import org.junit.Assert; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class TestIcebergSourceSplitSerializer { + + @ClassRule + public static final TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + + private final IcebergSourceSplitSerializer serializer = IcebergSourceSplitSerializer.INSTANCE; + + @Test + public void testLatestVersion() throws Exception { + serializeAndDeserialize(1, 1); + serializeAndDeserialize(10, 2); + } + + private void serializeAndDeserialize(int splitCount, int filesPerSplit) throws Exception { + final List splits = SplitHelpers.createFileSplits(TEMPORARY_FOLDER, splitCount, filesPerSplit); + for (IcebergSourceSplit split : splits) { + final byte[] result = serializer.serialize(split); + final IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result); + Assert.assertEquals(split, deserialized); + + final byte[] cachedResult = serializer.serialize(split); + Assert.assertSame(result, cachedResult); + final IcebergSourceSplit deserialized2 = serializer.deserialize(serializer.getVersion(), cachedResult); + Assert.assertEquals(split, deserialized2); + } + } + + @Test + public void testV1() throws Exception { + serializeAndDeserializeV1(1, 1); + serializeAndDeserializeV1(10, 2); + } + + private void serializeAndDeserializeV1(int splitCount, int filesPerSplit) throws Exception { + final List splits = SplitHelpers.createFileSplits(TEMPORARY_FOLDER, splitCount, filesPerSplit); + for (IcebergSourceSplit split : splits) { + final byte[] result = serializer.serializeV1(split); + final IcebergSourceSplit deserialized = serializer.deserializeV1(result); + Assert.assertEquals(split, deserialized); + } + } + + @Test + public void testCheckpointedPosition() throws Exception { + final AtomicInteger index = new AtomicInteger(); + final List splits = SplitHelpers.createFileSplits(TEMPORARY_FOLDER, 10, 2).stream() + .map(split -> { + final IcebergSourceSplit result; + if (index.get() % 2 == 0) { + result = new IcebergSourceSplit(split.task(), new Position(index.get(), index.get())); + } else { + result = split; + } + index.incrementAndGet(); + return result; + }) + .collect(Collectors.toList()); + + for (IcebergSourceSplit split : splits) { + final byte[] result = serializer.serialize(split); + final IcebergSourceSplit deserialized = serializer.deserialize(serializer.getVersion(), result); + Assert.assertEquals(split, deserialized); + + final byte[] cachedResult = serializer.serialize(split); + 
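+ // serialize() caches the serialized bytes on the split (serializedFormCache), so the second call is expected to return the identical array instance, which the assertSame below verifies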
Assert.assertSame(result, cachedResult); + final IcebergSourceSplit deserialized2 = serializer.deserialize(serializer.getVersion(), cachedResult); + Assert.assertEquals(split, deserialized2); + } + } +} From a0037dbb0e4e01bfe64f2e5e2bd4b36f21755ee1 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Wed, 8 Sep 2021 09:52:25 -0700 Subject: [PATCH 02/14] whitespace change --- .../apache/iceberg/flink/source/RowDataFileScanTaskReader.java | 1 - .../flink/source/reader/ArrayPoolDataIteratorBatcher.java | 1 - .../iceberg/flink/source/reader/DataIteratorBatcher.java | 2 -- .../flink/source/reader/DataIteratorReaderFunction.java | 3 +-- .../flink/source/reader/IcebergSourceReaderMetrics.java | 1 - .../iceberg/flink/source/reader/IcebergSourceSplitReader.java | 1 - .../org/apache/iceberg/flink/source/reader/ReaderFunction.java | 2 -- .../org/apache/iceberg/flink/source/reader/RecordFactory.java | 1 - .../iceberg/flink/source/reader/RecyclableArrayIterator.java | 1 - .../flink/source/reader/RowDataIteratorReaderFunction.java | 1 - .../iceberg/flink/source/reader/RowDataRecordFactory.java | 1 - .../apache/iceberg/flink/source/split/IcebergSourceSplit.java | 2 -- .../flink/source/split/IcebergSourceSplitSerializer.java | 2 -- 13 files changed, 1 insertion(+), 18 deletions(-) diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java index fbdb7bf3cc02..dbe8a747f96b 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -48,7 +48,6 @@ @Internal public class RowDataFileScanTaskReader implements FileScanTaskReader { - private final Schema tableSchema; private final Schema projectedSchema; private final String nameMapping; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java index 6b327898a8b1..fc03fc4683fb 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java @@ -32,7 +32,6 @@ import org.apache.iceberg.io.CloseableIterator; class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { - private final Configuration config; private final RecordFactory recordFactory; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java index a296517a1846..d2784f253279 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java @@ -27,7 +27,5 @@ @FunctionalInterface public interface DataIteratorBatcher extends Serializable { - CloseableIterator>> apply(String splitId, DataIterator inputIterator); - } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java index e3e6bfdc0394..364879401316 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java @@ 
-26,10 +26,9 @@ import org.apache.iceberg.io.CloseableIterator; public abstract class DataIteratorReaderFunction implements ReaderFunction { - private final DataIteratorBatcher batcher; - DataIteratorReaderFunction(DataIteratorBatcher batcher) { + public DataIteratorReaderFunction(DataIteratorBatcher batcher) { this.batcher = batcher; } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java index a2aa7d518731..cb81d54bab90 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java @@ -24,7 +24,6 @@ import org.apache.flink.metrics.MetricGroup; public class IcebergSourceReaderMetrics { - private final AtomicLong numRecordsOut; private final AtomicLong assignedSplits; private final AtomicLong finishedSplits; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java index 9c68b6f6b4ac..359a352864d1 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java @@ -40,7 +40,6 @@ class IcebergSourceSplitReader implements SplitReader, I private final ReaderFunction readerFunction; private final int indexOfSubtask; private final IcebergSourceReaderMetrics metrics; - private final Queue splits; @Nullable diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java index bd6fd097444b..c3a32bc0e05d 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java @@ -26,8 +26,6 @@ import org.apache.iceberg.io.CloseableIterator; public interface ReaderFunction extends Serializable { - CloseableIterator>> read(IcebergSourceSplit split); - } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java index 82deda46018a..c006558e8de4 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordFactory.java @@ -22,7 +22,6 @@ import java.io.Serializable; interface RecordFactory extends Serializable { - /** * Create a batch of records */ diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java index 63e657b12629..e0bd19ed3f76 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java @@ -32,7 +32,6 @@ * Main difference is the records array can be recycled back to a pool. 
*/ final class RecyclableArrayIterator implements CloseableIterator> { - private final Pool.Recycler recycler; private final E[] records; private final int num; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java index 92e054c0e13b..c90df52f349e 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java @@ -29,7 +29,6 @@ import org.apache.iceberg.flink.source.split.IcebergSourceSplit; public class RowDataIteratorReaderFunction extends DataIteratorReaderFunction { - private final Table table; private final ScanContext scanContext; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java index f2ff28cab222..3e46574cc9fa 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataRecordFactory.java @@ -27,7 +27,6 @@ import org.apache.iceberg.flink.data.RowDataUtil; class RowDataRecordFactory implements RecordFactory { - private final RowType rowType; private final TypeSerializer[] fieldSerializers; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java index 02692dd86668..d8d13090cca6 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java @@ -32,9 +32,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Iterables; public class IcebergSourceSplit implements SourceSplit, Serializable { - private final CombinedScanTask task; - /** * Position field is mutable */ diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java index dcac972bd06f..86aa82fb1656 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java @@ -29,9 +29,7 @@ * will switch to more stable serializer from issue-1698. 
*/ public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer { - public static final IcebergSourceSplitSerializer INSTANCE = new IcebergSourceSplitSerializer(); - private static final int VERSION = 1; @Override From ef0b9377ac62b40de4f86fd77121c0ff6ab48eef Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Wed, 8 Sep 2021 11:17:24 -0700 Subject: [PATCH 03/14] rename class --- ...nction.java => RowDataReaderFunction.java} | 4 +-- .../reader/TestIcebergSourceSplitReader.java | 36 ++++++++++--------- ...on.java => TestRowDataReaderFunction.java} | 6 ++-- 3 files changed, 24 insertions(+), 22 deletions(-) rename flink/src/main/java/org/apache/iceberg/flink/source/reader/{RowDataIteratorReaderFunction.java => RowDataReaderFunction.java} (93%) rename flink/src/test/java/org/apache/iceberg/flink/source/reader/{TestRowDataIteratorReaderFunction.java => TestRowDataReaderFunction.java} (90%) diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java similarity index 93% rename from flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java rename to flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java index c90df52f349e..907675543bd0 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataIteratorReaderFunction.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java @@ -28,11 +28,11 @@ import org.apache.iceberg.flink.source.ScanContext; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; -public class RowDataIteratorReaderFunction extends DataIteratorReaderFunction { +public class RowDataReaderFunction extends DataIteratorReaderFunction { private final Table table; private final ScanContext scanContext; - public RowDataIteratorReaderFunction( + public RowDataReaderFunction( Configuration config, Table table, ScanContext scanContext, diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java index e83d1d61dd35..9662a04dbd49 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java @@ -27,6 +27,8 @@ import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; import org.apache.flink.connector.file.src.util.RecordAndPosition; +import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.logical.RowType; import org.apache.flink.types.Row; @@ -44,6 +46,7 @@ import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.junit.Assert; +import org.junit.Before; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Test; @@ -71,6 +74,8 @@ public class TestIcebergSourceSplitReader { private static List dataFileList; private static IcebergSourceSplit icebergSplit; + private IcebergSourceSplitReader reader; + @BeforeClass public static void beforeClass() throws IOException { final 
GenericAppenderHelper dataAppender = new GenericAppenderHelper( @@ -92,13 +97,23 @@ public static void beforeClass() throws IOException { icebergSplit = ReaderFunctionTestBase.sortFilesAsAppendOrder(splits.get(0), dataFileList); } + @Before + public void before() { + reader = createSplitReader(); + } + + private IcebergSourceSplitReader createSplitReader() { + final Configuration config = new Configuration(); + RowType rowType = FlinkSchemaUtil.convert(tableResource.table().schema()); + return new IcebergSourceSplitReader( + new RowDataReaderFunction(config, tableResource.table(), scanContext, rowType), + new TestingReaderContext(), + new IcebergSourceReaderMetrics(new UnregisteredMetricsGroup())); + } + @Test public void testFullScan() throws Exception { final IcebergSourceSplit split = icebergSplit; - final Configuration config = new Configuration(); - RowType rowType = FlinkSchemaUtil.convert(tableResource.table().schema()); - IcebergSourceSplitReader reader = new IcebergSourceSplitReader( - new RowDataIteratorReaderFunction(config, tableResource.table(), scanContext, rowType)); reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); final RecordsWithSplitIds> readBatch0 = reader.fetch(); @@ -122,10 +137,6 @@ public void testFullScan() throws Exception { @Test public void testResumeFromEndOfFirstBatch() throws Exception { final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 0L, 2L); - final Configuration config = new Configuration(); - RowType rowType = FlinkSchemaUtil.convert(tableResource.table().schema()); - IcebergSourceSplitReader reader = new IcebergSourceSplitReader( - new RowDataIteratorReaderFunction(config, tableResource.table(), scanContext, rowType)); reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); final RecordsWithSplitIds> readBatch1 = reader.fetch(); @@ -144,10 +155,6 @@ public void testResumeFromEndOfFirstBatch() throws Exception { @Test public void testResumeFromStartOfSecondBatch() throws Exception { final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 1L, 0L); - final Configuration config = new Configuration(); - RowType rowType = FlinkSchemaUtil.convert(tableResource.table().schema()); - IcebergSourceSplitReader reader = new IcebergSourceSplitReader( - new RowDataIteratorReaderFunction(config, tableResource.table(), scanContext, rowType)); reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); final RecordsWithSplitIds> readBatch1 = reader.fetch(); @@ -167,11 +174,6 @@ public void testResumeFromStartOfSecondBatch() throws Exception { @Test public void testResumeFromMiddleOfSecondBatch() throws Exception { final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 1L, 1L); - - final Configuration config = new Configuration(); - RowType rowType = FlinkSchemaUtil.convert(tableResource.table().schema()); - IcebergSourceSplitReader reader = new IcebergSourceSplitReader( - new RowDataIteratorReaderFunction(config, tableResource.table(), scanContext, rowType)); reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); final RecordsWithSplitIds> readBatch1 = reader.fetch(); diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataIteratorReaderFunction.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java similarity index 90% rename from 
flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataIteratorReaderFunction.java rename to flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java index 247b788403be..558bd2785552 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataIteratorReaderFunction.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java @@ -35,7 +35,7 @@ import org.apache.iceberg.flink.TestFixtures; import org.apache.iceberg.flink.TestHelpers; -public class TestRowDataIteratorReaderFunction extends ReaderFunctionTestBase { +public class TestRowDataReaderFunction extends ReaderFunctionTestBase { protected static final RowType rowType = FlinkSchemaUtil .convert(scanContext.project()); @@ -44,13 +44,13 @@ public class TestRowDataIteratorReaderFunction extends ReaderFunctionTestBase readerFunction() { - return new RowDataIteratorReaderFunction(new Configuration(), tableResource.table(), scanContext, rowType); + return new RowDataReaderFunction(new Configuration(), tableResource.table(), scanContext, rowType); } @Override From 5b3f5fd7480be7b6bf7f6ce97c1ce2ca51d91127 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Wed, 8 Sep 2021 11:46:38 -0700 Subject: [PATCH 04/14] sync up with uber branch --- .../flink/source/reader/ReaderFunctionTestBase.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java index 54abab75341d..b109d823733a 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java @@ -156,7 +156,7 @@ private List extractRecordsAndAssertPosition( @Test public void testNoCheckpointedPosition() throws IOException { final IcebergSourceSplit split = icebergSplit; - final CloseableIterator>> reader = readerFunction().apply(split); + final CloseableIterator>> reader = readerFunction().read(split); final RecordsWithSplitIds> batch0 = reader.next(); final List actual0 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(0).size(), 0L, 0L); @@ -179,7 +179,7 @@ public void testCheckpointedPositionBeforeFirstFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), new Position(0L, 0L)); - final CloseableIterator>> reader = readerFunction().apply(split); + final CloseableIterator>> reader = readerFunction().read(split); final RecordsWithSplitIds> batch0 = reader.next(); final List actual0 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(0).size(), 0L, 0L); @@ -202,7 +202,7 @@ public void testCheckpointedPositionMiddleFirstFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), new Position(0L, 1L)); - final CloseableIterator>> reader = readerFunction().apply(split); + final CloseableIterator>> reader = readerFunction().read(split); final RecordsWithSplitIds> batch0 = reader.next(); final List actual0 = extractRecordsAndAssertPosition(batch0, 1L, 0L, 1L); @@ -225,7 +225,7 @@ public void testCheckpointedPositionAfterFirstFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), new Position(0L, 2L)); - final CloseableIterator>> reader = readerFunction().apply(split); + final CloseableIterator>> reader = 
readerFunction().read(split); final RecordsWithSplitIds> batch0 = reader.next(); final List actual1 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(1).size(), 1L, 0L); @@ -243,7 +243,7 @@ public void testCheckpointedPositionBeforeSecondFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), new Position(1L, 0L)); - final CloseableIterator>> reader = readerFunction().apply(split); + final CloseableIterator>> reader = readerFunction().read(split); final RecordsWithSplitIds> batch1 = reader.next(); final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1L, 0L); @@ -261,7 +261,7 @@ public void testCheckpointedPositionMidSecondFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), new Position(1L, 1L)); - final CloseableIterator>> reader = readerFunction().apply(split); + final CloseableIterator>> reader = readerFunction().read(split); final RecordsWithSplitIds> batch1 = reader.next(); final List actual1 = extractRecordsAndAssertPosition(batch1, 1L, 1L, 1L); From ce4cfad6d75ca34c8e5b6134f2b936c059c0fdb5 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Wed, 8 Sep 2021 13:02:26 -0700 Subject: [PATCH 05/14] make ReaderFunction a functional interface --- .../source/reader/DataIteratorReaderFunction.java | 2 +- .../source/reader/IcebergSourceSplitReader.java | 2 +- .../iceberg/flink/source/reader/ReaderFunction.java | 7 ++++--- .../flink/source/reader/ReaderFunctionTestBase.java | 12 ++++++------ 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java index 364879401316..d587ed8ba83e 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java @@ -35,7 +35,7 @@ public DataIteratorReaderFunction(DataIteratorBatcher batcher) { public abstract DataIterator createDataIterator(IcebergSourceSplit split); @Override - public CloseableIterator>> read(IcebergSourceSplit split) { + public CloseableIterator>> apply(IcebergSourceSplit split) { DataIterator inputIterator = createDataIterator(split); if (split.position() != null) { inputIterator.seek(split.position()); diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java index 359a352864d1..023feb9fe7aa 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java @@ -101,7 +101,7 @@ private void checkSplitOrStartNext() throws IOException { throw new IOException("No split remaining"); } currentSplitId = nextSplit.splitId(); - currentReader = readerFunction.read(nextSplit); + currentReader = readerFunction.apply(nextSplit); } private FileRecords finishSplit() throws IOException { diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java index c3a32bc0e05d..d8add3d257cc 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java +++ 
b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java @@ -20,12 +20,13 @@ package org.apache.iceberg.flink.source.reader; import java.io.Serializable; +import java.util.function.Function; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.io.CloseableIterator; -public interface ReaderFunction extends Serializable { - CloseableIterator>> read(IcebergSourceSplit split); +@FunctionalInterface +public interface ReaderFunction extends Serializable, Function>>> { } - diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java index b109d823733a..54abab75341d 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java @@ -156,7 +156,7 @@ private List extractRecordsAndAssertPosition( @Test public void testNoCheckpointedPosition() throws IOException { final IcebergSourceSplit split = icebergSplit; - final CloseableIterator>> reader = readerFunction().read(split); + final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch0 = reader.next(); final List actual0 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(0).size(), 0L, 0L); @@ -179,7 +179,7 @@ public void testCheckpointedPositionBeforeFirstFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), new Position(0L, 0L)); - final CloseableIterator>> reader = readerFunction().read(split); + final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch0 = reader.next(); final List actual0 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(0).size(), 0L, 0L); @@ -202,7 +202,7 @@ public void testCheckpointedPositionMiddleFirstFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), new Position(0L, 1L)); - final CloseableIterator>> reader = readerFunction().read(split); + final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch0 = reader.next(); final List actual0 = extractRecordsAndAssertPosition(batch0, 1L, 0L, 1L); @@ -225,7 +225,7 @@ public void testCheckpointedPositionAfterFirstFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), new Position(0L, 2L)); - final CloseableIterator>> reader = readerFunction().read(split); + final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch0 = reader.next(); final List actual1 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(1).size(), 1L, 0L); @@ -243,7 +243,7 @@ public void testCheckpointedPositionBeforeSecondFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), new Position(1L, 0L)); - final CloseableIterator>> reader = readerFunction().read(split); + final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch1 = reader.next(); final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1L, 0L); @@ -261,7 +261,7 @@ public void testCheckpointedPositionMidSecondFile() throws IOException { 
final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), new Position(1L, 1L)); - final CloseableIterator>> reader = readerFunction().read(split); + final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch1 = reader.next(); final List actual1 = extractRecordsAndAssertPosition(batch1, 1L, 1L, 1L); From c2b7eea3a55a7cb4b9d9de503ed66a2c527e38db Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Mon, 20 Sep 2021 09:22:16 -0700 Subject: [PATCH 06/14] support includeColumnStats in ScanContext. This is needed for event time aligned assigner for FLIP-27 source. --- .../flink/source/FlinkSplitGenerator.java | 4 ++++ .../iceberg/flink/source/ScanContext.java | 24 ++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java b/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java index 1ba396c187e9..be1a9138d60c 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java @@ -65,6 +65,10 @@ static CloseableIterable planTasks(Table table, ScanContext co .caseSensitive(context.caseSensitive()) .project(context.project()); + if (context.includeColumnStats()) { + scan = scan.includeColumnStats(); + } + if (context.snapshotId() != null) { scan = scan.useSnapshot(context.snapshotId()); } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index b0336d70f179..73a31930cd35 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -68,6 +68,9 @@ public class ScanContext implements Serializable { private static final ConfigOption MONITOR_INTERVAL = ConfigOptions.key("monitor-interval").durationType().defaultValue(Duration.ofSeconds(10)); + private static final ConfigOption INCLUDE_COLUMN_STATS = + ConfigOptions.key("include-column-stats").booleanType().defaultValue(false); + private final boolean caseSensitive; private final Long snapshotId; private final Long startSnapshotId; @@ -83,11 +86,12 @@ public class ScanContext implements Serializable { private final Schema schema; private final List filters; private final long limit; + private final boolean includeColumnStats; private ScanContext(boolean caseSensitive, Long snapshotId, Long startSnapshotId, Long endSnapshotId, Long asOfTimestamp, Long splitSize, Integer splitLookback, Long splitOpenFileCost, boolean isStreaming, Duration monitorInterval, String nameMapping, - Schema schema, List filters, long limit) { + Schema schema, List filters, long limit, boolean includeColumnStats) { this.caseSensitive = caseSensitive; this.snapshotId = snapshotId; this.startSnapshotId = startSnapshotId; @@ -103,6 +107,7 @@ private ScanContext(boolean caseSensitive, Long snapshotId, Long startSnapshotId this.schema = schema; this.filters = filters; this.limit = limit; + this.includeColumnStats = includeColumnStats; } public boolean caseSensitive() { @@ -161,6 +166,10 @@ public long limit() { return limit; } + public boolean includeColumnStats() { + return includeColumnStats; + } + public ScanContext copyWithAppendsBetween(long newStartSnapshotId, long newEndSnapshotId) { return ScanContext.builder() .caseSensitive(caseSensitive) @@ -177,6 +186,7 @@ public ScanContext 
copyWithAppendsBetween(long newStartSnapshotId, long newEndSn .project(schema) .filters(filters) .limit(limit) + .includeColumnStats(includeColumnStats) .build(); } @@ -196,6 +206,7 @@ public ScanContext copyWithSnapshotId(long newSnapshotId) { .project(schema) .filters(filters) .limit(limit) + .includeColumnStats(includeColumnStats) .build(); } @@ -218,6 +229,7 @@ public static class Builder { private Schema projectedSchema; private List filters; private long limit = -1L; + private boolean includeColumnStats = INCLUDE_COLUMN_STATS.defaultValue(); private Builder() { } @@ -292,6 +304,11 @@ public Builder limit(long newLimit) { return this; } + public Builder includeColumnStats(boolean newIncludeColumnStats) { + this.includeColumnStats = newIncludeColumnStats; + return this; + } + public Builder fromProperties(Map properties) { Configuration config = new Configuration(); properties.forEach(config::setString); @@ -306,14 +323,15 @@ public Builder fromProperties(Map properties) { .splitOpenFileCost(config.get(SPLIT_FILE_OPEN_COST)) .streaming(config.get(STREAMING)) .monitorInterval(config.get(MONITOR_INTERVAL)) - .nameMapping(properties.get(DEFAULT_NAME_MAPPING)); + .nameMapping(properties.get(DEFAULT_NAME_MAPPING)) + .includeColumnStats(config.get(INCLUDE_COLUMN_STATS)); } public ScanContext build() { return new ScanContext(caseSensitive, snapshotId, startSnapshotId, endSnapshotId, asOfTimestamp, splitSize, splitLookback, splitOpenFileCost, isStreaming, monitorInterval, nameMapping, projectedSchema, - filters, limit); + filters, limit, includeColumnStats); } } } From 93e3e498b76b2aff888957f724d1e431b5de8de9 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Mon, 20 Sep 2021 11:12:02 -0700 Subject: [PATCH 07/14] rename FlinkSplitGenerator to FlinkSplitPlanner. 
also marked some public classes as @Internal --- .../apache/iceberg/flink/source/FlinkInputFormat.java | 2 +- ...linkSplitGenerator.java => FlinkSplitPlanner.java} | 11 ++++++++--- .../org/apache/iceberg/flink/source/Position.java | 2 ++ .../flink/source/StreamingMonitorFunction.java | 2 +- .../iceberg/flink/source/reader/FileRecords.java | 2 ++ .../source/reader/IcebergSourceReaderMetrics.java | 2 ++ .../flink/source/split/IcebergSourceSplit.java | 2 ++ .../source/split/IcebergSourceSplitSerializer.java | 2 ++ .../flink/source/TestStreamingReaderOperator.java | 2 +- .../flink/source/reader/ReaderFunctionTestBase.java | 4 ++-- .../source/reader/TestIcebergSourceSplitReader.java | 4 ++-- .../iceberg/flink/source/split/SplitHelpers.java | 4 ++-- 12 files changed, 27 insertions(+), 12 deletions(-) rename flink/src/main/java/org/apache/iceberg/flink/source/{FlinkSplitGenerator.java => FlinkSplitPlanner.java} (93%) diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java b/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java index 8b757ac31606..a4cbab5c37e4 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/FlinkInputFormat.java @@ -77,7 +77,7 @@ public FlinkInputSplit[] createInputSplits(int minNumSplits) throws IOException tableLoader.open(); try (TableLoader loader = tableLoader) { Table table = loader.loadTable(); - return FlinkSplitGenerator.createInputSplits(table, context); + return FlinkSplitPlanner.planInputSplits(table, context); } } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java b/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java similarity index 93% rename from flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java rename to flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java index be1a9138d60c..ef0f71c05a67 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitGenerator.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/FlinkSplitPlanner.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.util.List; +import org.apache.flink.annotation.Internal; import org.apache.iceberg.CombinedScanTask; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; @@ -31,11 +32,12 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -public class FlinkSplitGenerator { - private FlinkSplitGenerator() { +@Internal +public class FlinkSplitPlanner { + private FlinkSplitPlanner() { } - static FlinkInputSplit[] createInputSplits(Table table, ScanContext context) { + static FlinkInputSplit[] planInputSplits(Table table, ScanContext context) { try (CloseableIterable tasksIterable = planTasks(table, context)) { List tasks = Lists.newArrayList(tasksIterable); FlinkInputSplit[] splits = new FlinkInputSplit[tasks.size()]; @@ -48,6 +50,9 @@ static FlinkInputSplit[] createInputSplits(Table table, ScanContext context) { } } + /** + * This returns splits for the FLIP-27 source + */ public static List planIcebergSourceSplits( Table table, ScanContext context) { try (CloseableIterable tasksIterable = planTasks(table, context)) { diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/Position.java b/flink/src/main/java/org/apache/iceberg/flink/source/Position.java index 
9ac2c89e0972..28c4c530fc2e 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/Position.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/Position.java @@ -21,6 +21,7 @@ import java.io.Serializable; import java.util.Objects; +import org.apache.flink.annotation.Internal; import org.apache.iceberg.CombinedScanTask; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; @@ -31,6 +32,7 @@ *
  • record offset within a file
  • * */ +@Internal public class Position implements Serializable { private static final long serialVersionUID = 1L; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java index b31426a099f0..7913a18bde9c 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/StreamingMonitorFunction.java @@ -140,7 +140,7 @@ private void monitorAndForwardSplits() { newScanContext = scanContext.copyWithAppendsBetween(lastSnapshotId, snapshotId); } - FlinkInputSplit[] splits = FlinkSplitGenerator.createInputSplits(table, newScanContext); + FlinkInputSplit[] splits = FlinkSplitPlanner.planInputSplits(table, newScanContext); for (FlinkInputSplit split : splits) { sourceContext.collect(split); } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java index 256660dd0a7a..9ae527b68df9 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java @@ -23,6 +23,7 @@ import java.util.Collections; import java.util.Set; import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.io.CloseableIterator; @@ -30,6 +31,7 @@ /** * A batch of recrods for one split */ +@Internal public class FileRecords implements RecordsWithSplitIds> { @Nullable diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java index cb81d54bab90..9e9d419bd5f0 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceReaderMetrics.java @@ -20,9 +20,11 @@ package org.apache.iceberg.flink.source.reader; import java.util.concurrent.atomic.AtomicLong; +import org.apache.flink.annotation.Internal; import org.apache.flink.metrics.Counter; import org.apache.flink.metrics.MetricGroup; +@Internal public class IcebergSourceReaderMetrics { private final AtomicLong numRecordsOut; private final AtomicLong assignedSplits; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java index d8d13090cca6..d429f5813bfc 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java @@ -23,6 +23,7 @@ import java.util.Collection; import java.util.stream.Collectors; import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; import org.apache.flink.api.connector.source.SourceSplit; import org.apache.iceberg.CombinedScanTask; import org.apache.iceberg.FileScanTask; @@ -31,6 +32,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Objects; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +@Internal public class IcebergSourceSplit implements SourceSplit, Serializable { private final CombinedScanTask task; /** diff 
--git a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java index 86aa82fb1656..a208e94f9f14 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java @@ -20,6 +20,7 @@ package org.apache.iceberg.flink.source.split; import java.io.IOException; +import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.core.io.SimpleVersionedSerializer; import org.apache.flink.util.InstantiationUtil; @@ -28,6 +29,7 @@ * TODO: use Java serialization for now. * will switch to more stable serializer from issue-1698. */ +@Internal public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer { public static final IcebergSourceSplitSerializer INSTANCE = new IcebergSourceSplitSerializer(); private static final int VERSION = 1; diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java b/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java index 0f5d6e1e4975..353fd8dfda0d 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/TestStreamingReaderOperator.java @@ -254,7 +254,7 @@ private List generateSplits() { .build(); } - Collections.addAll(inputSplits, FlinkSplitGenerator.createInputSplits(table, scanContext)); + Collections.addAll(inputSplits, FlinkSplitPlanner.planInputSplits(table, scanContext)); } return inputSplits; diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java index 54abab75341d..9ee66f3374cb 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java @@ -36,7 +36,7 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.HadoopTableResource; import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.FlinkSplitGenerator; +import org.apache.iceberg.flink.source.FlinkSplitPlanner; import org.apache.iceberg.flink.source.Position; import org.apache.iceberg.flink.source.ScanContext; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; @@ -103,7 +103,7 @@ public void before() throws IOException { dataAppender.appendToTable(dataFile); } - final List splits = FlinkSplitGenerator + final List splits = FlinkSplitPlanner .planIcebergSourceSplits(tableResource.table(), scanContext); Assert.assertEquals(1, splits.size()); Assert.assertEquals(3, splits.get(0).task().files().size()); diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java index 9662a04dbd49..6c37b55569cf 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java @@ -41,7 +41,7 @@ import org.apache.iceberg.flink.HadoopTableResource; import org.apache.iceberg.flink.TestFixtures; import org.apache.iceberg.flink.TestHelpers; 
-import org.apache.iceberg.flink.source.FlinkSplitGenerator; +import org.apache.iceberg.flink.source.FlinkSplitPlanner; import org.apache.iceberg.flink.source.ScanContext; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.relocated.com.google.common.collect.Sets; @@ -90,7 +90,7 @@ public static void beforeClass() throws IOException { dataAppender.appendToTable(dataFile); } - final List splits = FlinkSplitGenerator + final List splits = FlinkSplitPlanner .planIcebergSourceSplits(tableResource.table(), scanContext); Assert.assertEquals(1, splits.size()); Assert.assertEquals(3, splits.get(0).task().files().size()); diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/split/SplitHelpers.java b/flink/src/test/java/org/apache/iceberg/flink/source/split/SplitHelpers.java index 91cf7c0526c6..c97111b6fd97 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/split/SplitHelpers.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/split/SplitHelpers.java @@ -34,7 +34,7 @@ import org.apache.iceberg.data.RandomGenericData; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.TestFixtures; -import org.apache.iceberg.flink.source.FlinkSplitGenerator; +import org.apache.iceberg.flink.source.FlinkSplitPlanner; import org.apache.iceberg.flink.source.ScanContext; import org.apache.iceberg.hadoop.HadoopCatalog; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -79,7 +79,7 @@ public static List createFileSplits( } final ScanContext scanContext = ScanContext.builder().build(); - final List splits = FlinkSplitGenerator.planIcebergSourceSplits(table, scanContext); + final List splits = FlinkSplitPlanner.planIcebergSourceSplits(table, scanContext); return splits.stream() .flatMap(split -> { List> filesList = Lists.partition(new ArrayList<>(split.task().files()), filesPerSplit); From 887436ff788bb05d9ec2aa394152506a90b7b472 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Mon, 20 Sep 2021 11:27:36 -0700 Subject: [PATCH 08/14] rename fetch-batch-size to fetch-record-batch-size to be more clear --- .../java/org/apache/iceberg/flink/FlinkConfigOptions.java | 6 +++--- .../flink/source/reader/ArrayPoolDataIteratorBatcher.java | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java index d3fb0268fa4f..b9e7d0204652 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java +++ b/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java @@ -41,9 +41,9 @@ private FlinkConfigOptions() { .defaultValue(100) .withDescription("Sets max infer parallelism for source operator."); - public static final ConfigOption SOURCE_READER_FETCH_BATCH_SIZE = ConfigOptions - .key("source.iceberg.reader.fetch-batch-size") + public static final ConfigOption SOURCE_READER_FETCH_RECORD_BATCH_SIZE = ConfigOptions + .key("source.iceberg.reader.fetch-record-batch-size") .intType() .defaultValue(2048) - .withDescription("The target batch size for split reader fetch."); + .withDescription("The target record batch size for split reader fetch."); } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java index fc03fc4683fb..52e7a6be7366 100644 --- 
a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java @@ -56,7 +56,7 @@ private class ArrayPoolBatchIterator implements CloseableIterator inputIterator) { this.splitId = splitId; this.inputIterator = inputIterator; - this.batchSize = config.getInteger(FlinkConfigOptions.SOURCE_READER_FETCH_BATCH_SIZE); + this.batchSize = config.getInteger(FlinkConfigOptions.SOURCE_READER_FETCH_RECORD_BATCH_SIZE); this.pool = createPoolOfBatches(config.getInteger(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY)); } From 75b29cbf1473e73c5f1c487191b36700514cfe51 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Sun, 26 Sep 2021 21:21:07 -0700 Subject: [PATCH 09/14] Address Thomas' comments --- .../reader/ArrayPoolDataIteratorBatcher.java | 2 +- .../flink/source/reader/DataIteratorBatcher.java | 5 +++++ .../source/reader/DataIteratorReaderFunction.java | 3 +++ .../source/reader/IcebergSourceSplitReader.java | 4 ++-- .../source/reader/RecyclableArrayIterator.java | 11 +++++------ .../reader/{FileRecords.java => SplitRecords.java} | 14 +++++++------- .../flink/source/split/IcebergSourceSplit.java | 4 ++-- .../source/split/IcebergSourceSplitSerializer.java | 3 ++- 8 files changed, 27 insertions(+), 19 deletions(-) rename flink/src/main/java/org/apache/iceberg/flink/source/reader/{FileRecords.java => SplitRecords.java} (86%) diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java index 52e7a6be7366..c4763a4e0f31 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java @@ -83,7 +83,7 @@ public RecordsWithSplitIds> next() { return null; } else { Position position = inputIterator.position(); - return FileRecords.forRecords(splitId, new RecyclableArrayIterator<>( + return SplitRecords.forRecords(splitId, new RecyclableArrayIterator<>( pool.recycler(), batch, num, position.fileOffset(), position.recordOffset() - num)); } } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java index d2784f253279..0f0d16a6ab92 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java @@ -21,10 +21,15 @@ import java.io.Serializable; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.flink.source.DataIterator; import org.apache.iceberg.io.CloseableIterator; +/** + * Batcher converts iterator of T into iterator of batched {@code RecordsWithSplitIds>}, + * which is what FLIP-27's {@link SplitReader#fetch()} returns. 
+ */ @FunctionalInterface public interface DataIteratorBatcher extends Serializable { CloseableIterator>> apply(String splitId, DataIterator inputIterator); diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java index d587ed8ba83e..5d32a359ff51 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java @@ -25,6 +25,9 @@ import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.io.CloseableIterator; +/** + * A {@link ReaderFunction} implementation that uses {@link DataIterator}. + */ public abstract class DataIteratorReaderFunction implements ReaderFunction { private final DataIteratorBatcher batcher; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java index 023feb9fe7aa..3003100968ed 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java @@ -104,12 +104,12 @@ private void checkSplitOrStartNext() throws IOException { currentReader = readerFunction.apply(nextSplit); } - private FileRecords finishSplit() throws IOException { + private SplitRecords finishSplit() throws IOException { if (currentReader != null) { currentReader.close(); currentReader = null; } - final FileRecords finishRecords = FileRecords.finishedSplit(currentSplitId); + final SplitRecords finishRecords = SplitRecords.finishedSplit(currentSplitId); LOG.debug("Split reader {} finished split: {}", indexOfSubtask, currentSplitId); currentSplitId = null; metrics.incrementFinishedSplits(1L); diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java index e0bd19ed3f76..9ffe6654df31 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java @@ -30,6 +30,11 @@ /** * Similar to the {@link ArrayResultIterator}. * Main difference is the records array can be recycled back to a pool. + * + * Each record's {@link RecordAndPosition} will have the same fileOffset (for {@link RecordAndPosition#getOffset()}. + * The first returned record will have a records-to-skip count of {@code recordOffset + 1}, following + * the contract that each record needs to point to the position AFTER itself + * (because a checkpoint taken after the record was emitted needs to resume from after that record). 
*/ final class RecyclableArrayIterator implements CloseableIterator> { private final Pool.Recycler recycler; @@ -43,12 +48,6 @@ final class RecyclableArrayIterator implements CloseableIterator recycler, final E[] newRecords, final int newNum, final long fileOffset, final long recordOffset) { diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitRecords.java similarity index 86% rename from flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java rename to flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitRecords.java index 9ae527b68df9..22e3de1a4eb0 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/FileRecords.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitRecords.java @@ -29,10 +29,10 @@ import org.apache.iceberg.io.CloseableIterator; /** - * A batch of recrods for one split + * A batch of records for one split */ @Internal -public class FileRecords implements RecordsWithSplitIds> { +public class SplitRecords implements RecordsWithSplitIds> { @Nullable private final CloseableIterator> recordsForSplit; @@ -43,7 +43,7 @@ public class FileRecords implements RecordsWithSplitIds> @Nullable private CloseableIterator> recordsForSplitCurrent; - private FileRecords( + private SplitRecords( @Nullable String splitId, @Nullable CloseableIterator> recordsForSplit, Set finishedSplits) { @@ -92,12 +92,12 @@ public Set finishedSplits() { return finishedSplits; } - public static FileRecords forRecords( + public static SplitRecords forRecords( final String splitId, final CloseableIterator> recordsForSplit) { - return new FileRecords<>(splitId, recordsForSplit, Collections.emptySet()); + return new SplitRecords<>(splitId, recordsForSplit, Collections.emptySet()); } - public static FileRecords finishedSplit(String splitId) { - return new FileRecords<>(null, null, Collections.singleton(splitId)); + public static SplitRecords finishedSplit(String splitId) { + return new SplitRecords<>(null, null, Collections.singleton(splitId)); } } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java index d429f5813bfc..dc1e77352b07 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java @@ -70,11 +70,11 @@ public Position position() { return position; } - public byte[] serializedFormCache() { + byte[] serializedFormCache() { return serializedFormCache; } - public void serializedFormCache(byte[] cachedBytes) { + void serializedFormCache(byte[] cachedBytes) { this.serializedFormCache = cachedBytes; } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java index a208e94f9f14..9bb65497ff37 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplitSerializer.java @@ -27,7 +27,8 @@ /** * TODO: use Java serialization for now. - * will switch to more stable serializer from issue-1698. + * Will switch to more stable serializer from + * issue-1698. 
*/ @Internal public class IcebergSourceSplitSerializer implements SimpleVersionedSerializer { From db48cd717b873820b499fddef61ecd9c8dad05de Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Wed, 20 Oct 2021 11:26:18 -0700 Subject: [PATCH 10/14] Constrcut RowType internally inside RowDataReaderFunction from read schema. also make projected schema optional in ScanContext --- .../source/reader/RowDataReaderFunction.java | 18 +++++++++++++----- .../reader/TestIcebergSourceSplitReader.java | 5 +---- .../reader/TestRowDataReaderFunction.java | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java index 907675543bd0..aa70097b871b 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RowDataReaderFunction.java @@ -21,8 +21,9 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.Schema; import org.apache.iceberg.Table; +import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.source.DataIterator; import org.apache.iceberg.flink.source.RowDataFileScanTaskReader; import org.apache.iceberg.flink.source.ScanContext; @@ -31,15 +32,17 @@ public class RowDataReaderFunction extends DataIteratorReaderFunction { private final Table table; private final ScanContext scanContext; + private final Schema readSchema; public RowDataReaderFunction( Configuration config, Table table, - ScanContext scanContext, - RowType rowType) { - super(new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory(rowType))); + ScanContext scanContext) { + super(new ArrayPoolDataIteratorBatcher<>(config, new RowDataRecordFactory( + FlinkSchemaUtil.convert(readSchema(table, scanContext))))); this.table = table; this.scanContext = scanContext; + this.readSchema = readSchema(table, scanContext); } @Override @@ -47,11 +50,16 @@ public DataIterator createDataIterator(IcebergSourceSplit split) { return new DataIterator<>( new RowDataFileScanTaskReader( table.schema(), - scanContext.project(), + readSchema, scanContext.nameMapping(), scanContext.caseSensitive()), split.task(), table.io(), table.encryption()); } + + private static Schema readSchema(Table table, ScanContext scanContext) { + return scanContext.project() == null ? 
table.schema() : scanContext.project(); + } + } diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java index 6c37b55569cf..83d5642f3c00 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java @@ -30,14 +30,12 @@ import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; import org.apache.flink.table.data.RowData; -import org.apache.flink.table.types.logical.RowType; import org.apache.flink.types.Row; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileFormat; import org.apache.iceberg.data.GenericAppenderHelper; import org.apache.iceberg.data.RandomGenericData; import org.apache.iceberg.data.Record; -import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.HadoopTableResource; import org.apache.iceberg.flink.TestFixtures; import org.apache.iceberg.flink.TestHelpers; @@ -104,9 +102,8 @@ public void before() { private IcebergSourceSplitReader createSplitReader() { final Configuration config = new Configuration(); - RowType rowType = FlinkSchemaUtil.convert(tableResource.table().schema()); return new IcebergSourceSplitReader( - new RowDataReaderFunction(config, tableResource.table(), scanContext, rowType), + new RowDataReaderFunction(config, tableResource.table(), scanContext), new TestingReaderContext(), new IcebergSourceReaderMetrics(new UnregisteredMetricsGroup())); } diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java index 558bd2785552..c0f6c62cd033 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRowDataReaderFunction.java @@ -50,7 +50,7 @@ public TestRowDataReaderFunction(FileFormat fileFormat) { @Override protected ReaderFunction readerFunction() { - return new RowDataReaderFunction(new Configuration(), tableResource.table(), scanContext, rowType); + return new RowDataReaderFunction(new Configuration(), tableResource.table(), scanContext); } @Override From 819bed56f27fba3a712651993ff5551fc29cf035 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Wed, 20 Oct 2021 11:45:24 -0700 Subject: [PATCH 11/14] reapply dep change after build.gradle refactoring from master branch --- flink-runtime/build.gradle | 79 ++++++++++++++++++++++++++++++++++++++ flink/build.gradle | 2 + 2 files changed, 81 insertions(+) create mode 100644 flink-runtime/build.gradle diff --git a/flink-runtime/build.gradle b/flink-runtime/build.gradle new file mode 100644 index 000000000000..805abc03dd0d --- /dev/null +++ b/flink-runtime/build.gradle @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +project(':iceberg-flink-runtime') { + apply plugin: 'com.github.johnrengelman.shadow' + + tasks.jar.dependsOn tasks.shadowJar + + configurations { + implementation { + exclude group: 'org.apache.flink' + // included in Flink + exclude group: 'org.slf4j' + exclude group: 'org.apache.commons' + exclude group: 'commons-pool' + exclude group: 'commons-codec' + exclude group: 'org.xerial.snappy' + exclude group: 'javax.xml.bind' + exclude group: 'javax.annotation' + } + } + + dependencies { + implementation project(':iceberg-flink') + implementation project(':iceberg-aws') + implementation(project(':iceberg-nessie')) { + exclude group: 'com.google.code.findbugs', module: 'jsr305' + } + + implementation "org.apache.flink:flink-connector-base" + } + + shadowJar { + configurations = [project.configurations.runtimeClasspath] + + zip64 true + + // include the LICENSE and NOTICE files for the shaded Jar + from(projectDir) { + include 'LICENSE' + include 'NOTICE' + } + + // Relocate dependencies to avoid conflicts + relocate 'org.apache.avro', 'org.apache.iceberg.shaded.org.apache.avro' + relocate 'org.apache.parquet', 'org.apache.iceberg.shaded.org.apache.parquet' + relocate 'com.google', 'org.apache.iceberg.shaded.com.google' + relocate 'com.fasterxml', 'org.apache.iceberg.shaded.com.fasterxml' + relocate 'com.github.benmanes', 'org.apache.iceberg.shaded.com.github.benmanes' + relocate 'org.checkerframework', 'org.apache.iceberg.shaded.org.checkerframework' + relocate 'shaded.parquet', 'org.apache.iceberg.shaded.org.apache.parquet.shaded' + relocate 'org.apache.orc', 'org.apache.iceberg.shaded.org.apache.orc' + relocate 'io.airlift', 'org.apache.iceberg.shaded.io.airlift' + relocate 'org.threeten.extra', 'org.apache.iceberg.shaded.org.threeten.extra' + + classifier null + } + + jar { + enabled = false + } +} + diff --git a/flink/build.gradle b/flink/build.gradle index bbf65177bc3f..936d5bec0e46 100644 --- a/flink/build.gradle +++ b/flink/build.gradle @@ -28,6 +28,7 @@ project(':iceberg-flink') { implementation project(':iceberg-parquet') implementation project(':iceberg-hive-metastore') + compileOnly "org.apache.flink:flink-connector-base" compileOnly "org.apache.flink:flink-streaming-java_2.12" compileOnly "org.apache.flink:flink-streaming-java_2.12::tests" compileOnly "org.apache.flink:flink-table-api-java-bridge_2.12" @@ -56,6 +57,7 @@ project(':iceberg-flink') { exclude group: 'org.apache.hive', module: 'hive-storage-api' } + testImplementation "org.apache.flink:flink-connector-test-utils" testImplementation "org.apache.flink:flink-core" testImplementation "org.apache.flink:flink-runtime_2.12" testImplementation "org.apache.flink:flink-table-planner-blink_2.12" From 7e514710a8e131282f6d1a8b7bcf11d4e45620c7 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Fri, 29 Oct 2021 10:04:45 -0700 Subject: [PATCH 12/14] address review comments for split reader --- flink-runtime/build.gradle | 2 ++ .../main/java/org/apache/iceberg/flink/FlinkConfigOptions.java | 2 +- .../java/org/apache/iceberg/flink/source/DataIterator.java | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff 
--git a/flink-runtime/build.gradle b/flink-runtime/build.gradle index 805abc03dd0d..e93fa5865a73 100644 --- a/flink-runtime/build.gradle +++ b/flink-runtime/build.gradle @@ -43,6 +43,8 @@ project(':iceberg-flink-runtime') { exclude group: 'com.google.code.findbugs', module: 'jsr305' } + // flink-connector-base is not part of Flink runtime. + // Hence, iceberg-flink-runtime should include it a transitive dependency. implementation "org.apache.flink:flink-connector-base" } diff --git a/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java b/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java index b9e7d0204652..ebad99951b77 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java +++ b/flink/src/main/java/org/apache/iceberg/flink/FlinkConfigOptions.java @@ -45,5 +45,5 @@ private FlinkConfigOptions() { .key("source.iceberg.reader.fetch-record-batch-size") .intType() .defaultValue(2048) - .withDescription("The target record batch size for split reader fetch."); + .withDescription("The target number of records for Iceberg reader fetch batch."); } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java index 67b719aba5cf..1235ebd8502e 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java @@ -38,7 +38,9 @@ */ @Internal public class DataIterator implements CloseableIterator { + private final FileScanTaskReader fileScanTaskReader; + private final InputFilesDecryptor inputFilesDecryptor; private final CombinedScanTask combinedTask; @@ -49,6 +51,7 @@ public class DataIterator implements CloseableIterator { public DataIterator(FileScanTaskReader fileScanTaskReader, CombinedScanTask task, FileIO io, EncryptionManager encryption) { this.fileScanTaskReader = fileScanTaskReader; + this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); this.combinedTask = task; From 71eaa71a1e282bcb7a8f7287f04f599297af1385 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Fri, 29 Oct 2021 10:20:25 -0700 Subject: [PATCH 13/14] fix comment --- flink-runtime/build.gradle | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flink-runtime/build.gradle b/flink-runtime/build.gradle index e93fa5865a73..c8a0a8e87487 100644 --- a/flink-runtime/build.gradle +++ b/flink-runtime/build.gradle @@ -43,8 +43,8 @@ project(':iceberg-flink-runtime') { exclude group: 'com.google.code.findbugs', module: 'jsr305' } - // flink-connector-base is not part of Flink runtime. - // Hence, iceberg-flink-runtime should include it a transitive dependency. + // flink-connector-base is not part of Flink runtime. Hence, + // iceberg-flink-runtime should include it as a transitive dependency. 
implementation "org.apache.flink:flink-connector-base" } From d7ec63d4fc6726e7454e7fd0b3f94f3aea68f682 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Sun, 31 Oct 2021 21:33:11 -0700 Subject: [PATCH 14/14] address Ryan's review comments for split reader --- .../iceberg/flink/source/DataIterator.java | 20 ++--- .../apache/iceberg/flink/source/Position.java | 8 +- .../source/RowDataFileScanTaskReader.java | 1 + .../reader/ArrayPoolDataIteratorBatcher.java | 13 ++- .../source/reader/DataIteratorBatcher.java | 5 +- .../reader/DataIteratorReaderFunction.java | 3 +- .../reader/IcebergSourceSplitReader.java | 1 - .../reader/MutableRecordAndPosition.junk | 51 +++++++++++ .../flink/source/reader/ReaderFunction.java | 1 - .../source/reader/RecordAndPosition.java | 87 +++++++++++++++++++ .../reader/RecyclableArrayIterator.java | 17 ++-- .../flink/source/reader/SplitRecords.java | 1 - .../source/split/IcebergSourceSplit.java | 6 +- .../source/reader/ReaderFunctionTestBase.java | 55 ++++++------ .../reader/TestIcebergSourceSplitReader.java | 33 ++++--- .../reader/TestRecyclableArrayIterator.java | 13 ++- 16 files changed, 225 insertions(+), 90 deletions(-) create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/MutableRecordAndPosition.junk create mode 100644 flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java b/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java index 1235ebd8502e..29472ccb4341 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/DataIterator.java @@ -43,10 +43,10 @@ public class DataIterator implements CloseableIterator { private final InputFilesDecryptor inputFilesDecryptor; private final CombinedScanTask combinedTask; + private final Position position; - private Iterator tasks; + private Iterator fileTasksIterator; private CloseableIterator currentIterator; - private Position position; public DataIterator(FileScanTaskReader fileScanTaskReader, CombinedScanTask task, FileIO io, EncryptionManager encryption) { @@ -54,12 +54,12 @@ public DataIterator(FileScanTaskReader fileScanTaskReader, CombinedScanTask t this.inputFilesDecryptor = new InputFilesDecryptor(task, io, encryption); this.combinedTask = task; - - this.tasks = task.files().iterator(); - this.currentIterator = CloseableIterator.empty(); // fileOffset starts at -1 because we started // from an empty iterator that is not from the split files. 
- this.position = new Position(-1L, 0L); + this.position = new Position(-1, 0L); + + this.fileTasksIterator = task.files().iterator(); + this.currentIterator = CloseableIterator.empty(); } public void seek(Position startingPosition) { @@ -68,7 +68,7 @@ public void seek(Position startingPosition) { "Checkpointed file offset is %d, while CombinedScanTask has %d files", startingPosition.fileOffset(), combinedTask.files().size()); for (long i = 0L; i < startingPosition.fileOffset(); ++i) { - tasks.next(); + fileTasksIterator.next(); } updateCurrentIterator(); // skip records within the file @@ -106,9 +106,9 @@ public boolean isCurrentIteratorDone() { */ private void updateCurrentIterator() { try { - while (!currentIterator.hasNext() && tasks.hasNext()) { + while (!currentIterator.hasNext() && fileTasksIterator.hasNext()) { currentIterator.close(); - currentIterator = openTaskIterator(tasks.next()); + currentIterator = openTaskIterator(fileTasksIterator.next()); position.advanceFile(); } } catch (IOException e) { @@ -124,7 +124,7 @@ private CloseableIterator openTaskIterator(FileScanTask scanTask) { public void close() throws IOException { // close the current iterator currentIterator.close(); - tasks = null; + fileTasksIterator = null; } public Position position() { diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/Position.java b/flink/src/main/java/org/apache/iceberg/flink/source/Position.java index 28c4c530fc2e..0e7acecfac7e 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/Position.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/Position.java @@ -37,10 +37,10 @@ public class Position implements Serializable { private static final long serialVersionUID = 1L; - private long fileOffset; + private int fileOffset; private long recordOffset; - public Position(long fileOffset, long recordOffset) { + public Position(int fileOffset, long recordOffset) { this.fileOffset = fileOffset; this.recordOffset = recordOffset; } @@ -54,12 +54,12 @@ void advanceRecord() { this.recordOffset += 1L; } - public void update(long newFileOffset, long newRecordOffset) { + public void update(int newFileOffset, long newRecordOffset) { this.fileOffset = newFileOffset; this.recordOffset = newRecordOffset; } - public long fileOffset() { + public int fileOffset() { return fileOffset; } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java index dbe8a747f96b..fbdb7bf3cc02 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -48,6 +48,7 @@ @Internal public class RowDataFileScanTaskReader implements FileScanTaskReader { + private final Schema tableSchema; private final Schema projectedSchema; private final String nameMapping; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java index c4763a4e0f31..19a6f5f437a6 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ArrayPoolDataIteratorBatcher.java @@ -24,13 +24,18 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; import 
org.apache.flink.connector.base.source.reader.SourceReaderOptions; +import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; import org.apache.flink.connector.file.src.util.Pool; -import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.flink.FlinkConfigOptions; import org.apache.iceberg.flink.source.DataIterator; import org.apache.iceberg.flink.source.Position; import org.apache.iceberg.io.CloseableIterator; +/** + * FLIP-27's {@link SplitReader#fetch()} returns batched {@link RecordsWithSplitIds} + * {@link DataIterator} can return reused object, like {@code RowData}. In order to + * work with batched fetch API, we need to store cloned objects into object pools. + */ class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { private final Configuration config; private final RecordFactory recordFactory; @@ -41,7 +46,7 @@ class ArrayPoolDataIteratorBatcher implements DataIteratorBatcher { } @Override - public CloseableIterator>> apply( + public CloseableIterator>> batch( String splitId, DataIterator inputIterator) { return new ArrayPoolBatchIterator(splitId, inputIterator); } @@ -70,6 +75,9 @@ public RecordsWithSplitIds> next() { final T[] batch = getCachedEntry(); int num = 0; while (inputIterator.hasNext() && num < batchSize) { + // The record produced by inputIterator can be reused like for the RowData case. + // inputIterator.next() can't be called again until the copy is made + // since the record is not consumed immediately. T nextRecord = inputIterator.next(); recordFactory.clone(nextRecord, batch[num]); num++; @@ -79,6 +87,7 @@ public RecordsWithSplitIds> next() { break; } } + if (num == 0) { return null; } else { diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java index 0f0d16a6ab92..f95a7f95e669 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorBatcher.java @@ -22,15 +22,14 @@ import java.io.Serializable; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; -import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.flink.source.DataIterator; import org.apache.iceberg.io.CloseableIterator; /** * Batcher converts iterator of T into iterator of batched {@code RecordsWithSplitIds>}, - * which is what FLIP-27's {@link SplitReader#fetch()} returns. + * as FLIP-27's {@link SplitReader#fetch()} returns batched records. 
*/ @FunctionalInterface public interface DataIteratorBatcher extends Serializable { - CloseableIterator>> apply(String splitId, DataIterator inputIterator); + CloseableIterator>> batch(String splitId, DataIterator inputIterator); } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java index 5d32a359ff51..95c65bde1974 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/DataIteratorReaderFunction.java @@ -20,7 +20,6 @@ package org.apache.iceberg.flink.source.reader; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.flink.source.DataIterator; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.io.CloseableIterator; @@ -43,7 +42,7 @@ public CloseableIterator>> apply(Iceber if (split.position() != null) { inputIterator.seek(split.position()); } - return batcher.apply(split.splitId(), inputIterator); + return batcher.batch(split.splitId(), inputIterator); } } diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java index 3003100968ed..10fe9483f171 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/IcebergSourceSplitReader.java @@ -28,7 +28,6 @@ import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; import org.apache.flink.connector.base.source.reader.splitreader.SplitReader; import org.apache.flink.connector.base.source.reader.splitreader.SplitsChange; -import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.io.CloseableIterator; import org.slf4j.Logger; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/MutableRecordAndPosition.junk b/flink/src/main/java/org/apache/iceberg/flink/source/reader/MutableRecordAndPosition.junk new file mode 100644 index 000000000000..ab3fde206017 --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/MutableRecordAndPosition.junk @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.annotation.Internal; + +/** + * A mutable version of the {@link RecordAndPosition}. + * + *
+ * <p>
    This mutable object is useful in cases where only once instance of a {@code RecordAndPosition} + * is needed at a time, like for the result values of the {@link RecyclableArrayIterator}. + */ +@Internal +public class MutableRecordAndPosition extends RecordAndPosition { + + /** Updates the record and position in this object. */ + public void set(T record, int fileOffset, long recordOffset) { + this.record = record; + this.fileOffset = fileOffset; + this.recordOffset = recordOffset; + } + + /** Sets the position without setting a record. */ + public void position(int fileOffset, long recordOffset) { + this.fileOffset = fileOffset; + this.recordOffset = recordOffset; + } + + /** Sets the next record of a sequence. This increments the {@code recordOffset} by one. */ + public void record(T record) { + this.record = record; + this.recordOffset++; + } +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java index d8add3d257cc..b008f6f5c7fa 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/ReaderFunction.java @@ -22,7 +22,6 @@ import java.io.Serializable; import java.util.function.Function; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.flink.source.split.IcebergSourceSplit; import org.apache.iceberg.io.CloseableIterator; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java new file mode 100644 index 000000000000..e0980ca54c93 --- /dev/null +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecordAndPosition.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.flink.source.reader; + +import org.apache.flink.annotation.Internal; + +/** + * A record along with the reader position to be stored in the checkpoint. + * + *
+ * <p>
    The position defines the point in the reader AFTER the record. Record processing and updating + * checkpointed state happens atomically. The position points to where the reader should resume + * after this record is processed. + * + *
+ * <p>
    This mutable object is useful in cases where only once instance of a {@code RecordAndPosition} + * is needed at a time, like for the result values of the {@link RecyclableArrayIterator}. + */ +@Internal +public class RecordAndPosition { + private T record; + private int fileOffset; + private long recordOffset; + + public RecordAndPosition(T record, int fileOffset, long recordOffset) { + this.record = record; + this.fileOffset = fileOffset; + this.recordOffset = recordOffset; + } + + public RecordAndPosition() { + } + + // ------------------------------------------------------------------------ + + public T record() { + return record; + } + + public int fileOffset() { + return fileOffset; + } + + public long recordOffset() { + return recordOffset; + } + + /** Updates the record and position in this object. */ + public void set(T newRecord, int newFileOffset, long newRecordOffset) { + this.record = newRecord; + this.fileOffset = newFileOffset; + this.recordOffset = newRecordOffset; + } + + /** Sets the position without setting a record. */ + public void position(int newFileOffset, long newRecordOffset) { + this.fileOffset = newFileOffset; + this.recordOffset = newRecordOffset; + } + + /** Sets the next record of a sequence. This increments the {@code recordOffset} by one. */ + public void record(T nextRecord) { + this.record = nextRecord; + this.recordOffset++; + } + + @Override + public String toString() { + return String.format("%s @ %d + %d", record, fileOffset, recordOffset); + } + +} diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java index 9ffe6654df31..0a197c731610 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/RecyclableArrayIterator.java @@ -21,17 +21,14 @@ import javax.annotation.Nullable; import org.apache.flink.connector.file.src.util.ArrayResultIterator; -import org.apache.flink.connector.file.src.util.CheckpointedPosition; -import org.apache.flink.connector.file.src.util.MutableRecordAndPosition; import org.apache.flink.connector.file.src.util.Pool; -import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.io.CloseableIterator; /** * Similar to the {@link ArrayResultIterator}. * Main difference is the records array can be recycled back to a pool. * - * Each record's {@link RecordAndPosition} will have the same fileOffset (for {@link RecordAndPosition#getOffset()}. + * Each record's {@link RecordAndPosition} will have the same fileOffset (for {@link RecordAndPosition#fileOffset()}. * The first returned record will have a records-to-skip count of {@code recordOffset + 1}, following * the contract that each record needs to point to the position AFTER itself * (because a checkpoint taken after the record was emitted needs to resume from after that record). 
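The javadoc above pins down the position contract that batched reads expose: every record in a batch carries the same fileOffset, and recordOffset counts the records of that file already consumed, i.e. the point to resume from AFTER the record. A minimal consumer sketch of RecyclableArrayIterator under that contract (illustrative only: the method name, the no-op recycler lambda, and the offsets are made up; an enclosing class, imports, and same-package access are assumed):

    static void consumeOneBatch() throws IOException {
      // All records of this batch come from file #2 of the split; 5 records of that file
      // were already emitted before this batch.
      String[] records = new String[] {"a", "b", "c"};
      RecyclableArrayIterator<String> iterator =
          new RecyclableArrayIterator<>(array -> { }, records, records.length, 2, 5L);

      RecordAndPosition<String> rec;
      while ((rec = iterator.next()) != null) {
        // fileOffset() stays 2 for every record; recordOffset() is 6, 7, 8 -- the position
        // AFTER each record, i.e. where a reader restored from a checkpoint taken right
        // after that record should resume.
        System.out.println(rec.record() + " @ " + rec.fileOffset() + " + " + rec.recordOffset());
      }

      iterator.close(); // assumed to hand the records array back through the recycler
    }

The no-op recycler stands in for the array pool behind ArrayPoolDataIteratorBatcher; recycling lets a later batch reuse the same array instead of allocating a new one.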
@@ -40,21 +37,21 @@ final class RecyclableArrayIterator implements CloseableIterator recycler; private final E[] records; private final int num; - private final MutableRecordAndPosition recordAndPosition; + private final RecordAndPosition recordAndPosition; private int pos; RecyclableArrayIterator(Pool.Recycler recycler) { - this(recycler, null, 0, CheckpointedPosition.NO_OFFSET, 0L); + this(recycler, null, 0, -1, 0L); } RecyclableArrayIterator( - Pool.Recycler recycler, final E[] newRecords, - final int newNum, final long fileOffset, final long recordOffset) { + Pool.Recycler recycler, E[] newRecords, + int newNum, int fileOffset, long recordOffset) { this.recycler = recycler; this.records = newRecords; this.num = newNum; - this.recordAndPosition = new MutableRecordAndPosition<>(); + this.recordAndPosition = new RecordAndPosition<>(); this.recordAndPosition.set(null, fileOffset, recordOffset); this.pos = 0; @@ -69,7 +66,7 @@ public boolean hasNext() { @Nullable public RecordAndPosition next() { if (pos < num) { - recordAndPosition.setNext(records[pos++]); + recordAndPosition.record(records[pos++]); return recordAndPosition; } else { return null; diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitRecords.java b/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitRecords.java index 22e3de1a4eb0..25eeccef3774 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitRecords.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/reader/SplitRecords.java @@ -25,7 +25,6 @@ import javax.annotation.Nullable; import org.apache.flink.annotation.Internal; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.io.CloseableIterator; /** diff --git a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java index dc1e77352b07..3344dd4bef11 100644 --- a/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java +++ b/flink/src/main/java/org/apache/iceberg/flink/source/split/IcebergSourceSplit.java @@ -54,11 +54,11 @@ public IcebergSourceSplit(CombinedScanTask task, Position position) { } public static IcebergSourceSplit fromCombinedScanTask(CombinedScanTask combinedScanTask) { - return fromCombinedScanTask(combinedScanTask, 0L, 0L); + return fromCombinedScanTask(combinedScanTask, 0, 0L); } public static IcebergSourceSplit fromCombinedScanTask( - CombinedScanTask combinedScanTask, long fileOffset, long recordOffset) { + CombinedScanTask combinedScanTask, int fileOffset, long recordOffset) { return new IcebergSourceSplit(combinedScanTask, new Position(fileOffset, recordOffset)); } @@ -85,7 +85,7 @@ public String splitId() { .toString(); } - public void updatePosition(long newFileOffset, long newRecordOffset) { + public void updatePosition(int newFileOffset, long newRecordOffset) { position.update(newFileOffset, newRecordOffset); } diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java index 9ee66f3374cb..ae4862f7f3cc 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/ReaderFunctionTestBase.java @@ -24,7 +24,6 @@ import java.util.Collection; import 
java.util.List; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; -import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.iceberg.BaseCombinedScanTask; import org.apache.iceberg.CombinedScanTask; import org.apache.iceberg.DataFile; @@ -137,16 +136,16 @@ public static IcebergSourceSplit sortFilesAsAppendOrder(IcebergSourceSplit split */ private List extractRecordsAndAssertPosition( RecordsWithSplitIds> batch, - long expectedCount, long exptectedFileOffset, long startRecordOffset) { + long expectedCount, int exptectedFileOffset, long startRecordOffset) { // need to call nextSplit first in order to read the batch batch.nextSplit(); final List records = new ArrayList<>(); long recordOffset = startRecordOffset; RecordAndPosition recordAndPosition; while ((recordAndPosition = batch.nextRecordFromSplit()) != null) { - records.add(recordAndPosition.getRecord()); - Assert.assertEquals("expected file offset", exptectedFileOffset, recordAndPosition.getOffset()); - Assert.assertEquals("expected record offset", recordOffset, recordAndPosition.getRecordSkipCount() - 1); + records.add(recordAndPosition.record()); + Assert.assertEquals("expected file offset", exptectedFileOffset, recordAndPosition.fileOffset()); + Assert.assertEquals("expected record offset", recordOffset, recordAndPosition.recordOffset() - 1); recordOffset++; } Assert.assertEquals("expected record count", expectedCount, records.size()); @@ -159,17 +158,17 @@ public void testNoCheckpointedPosition() throws IOException { final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch0 = reader.next(); - final List actual0 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(0).size(), 0L, 0L); + final List actual0 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(0).size(), 0, 0L); assertRecords(recordBatchList.get(0), actual0, TestFixtures.SCHEMA); batch0.recycle(); final RecordsWithSplitIds> batch1 = reader.next(); - final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1L, 0L); + final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1, 0L); assertRecords(recordBatchList.get(1), actual1, TestFixtures.SCHEMA); batch1.recycle(); final RecordsWithSplitIds> batch2 = reader.next(); - final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2, 0L); assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); batch2.recycle(); } @@ -178,21 +177,21 @@ public void testNoCheckpointedPosition() throws IOException { public void testCheckpointedPositionBeforeFirstFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), - new Position(0L, 0L)); + new Position(0, 0L)); final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch0 = reader.next(); - final List actual0 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(0).size(), 0L, 0L); + final List actual0 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(0).size(), 0, 0L); assertRecords(recordBatchList.get(0), actual0, TestFixtures.SCHEMA); batch0.recycle(); final RecordsWithSplitIds> batch1 = reader.next(); - final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1L, 0L); + final List actual1 = 
extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1, 0L); assertRecords(recordBatchList.get(1), actual1, TestFixtures.SCHEMA); batch1.recycle(); final RecordsWithSplitIds> batch2 = reader.next(); - final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2, 0L); assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); batch2.recycle(); } @@ -201,57 +200,55 @@ public void testCheckpointedPositionBeforeFirstFile() throws IOException { public void testCheckpointedPositionMiddleFirstFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), - new Position(0L, 1L)); + new Position(0, 1L)); final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch0 = reader.next(); - final List actual0 = extractRecordsAndAssertPosition(batch0, 1L, 0L, 1L); + final List actual0 = extractRecordsAndAssertPosition(batch0, 1L, 0, 1L); assertRecords(recordBatchList.get(0).subList(1, 2), actual0, TestFixtures.SCHEMA); batch0.recycle(); final RecordsWithSplitIds> batch1 = reader.next(); - final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1L, 0L); + final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1, 0L); assertRecords(recordBatchList.get(1), actual1, TestFixtures.SCHEMA); batch1.recycle(); final RecordsWithSplitIds> batch2 = reader.next(); - final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2, 0L); assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); batch2.recycle(); } @Test public void testCheckpointedPositionAfterFirstFile() throws IOException { - final IcebergSourceSplit split = new IcebergSourceSplit( - icebergSplit.task(), - new Position(0L, 2L)); + final IcebergSourceSplit split = new IcebergSourceSplit(icebergSplit.task(), + new Position(0, 2L)); final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch0 = reader.next(); - final List actual1 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(1).size(), 1L, 0L); + final List actual1 = extractRecordsAndAssertPosition(batch0, recordBatchList.get(1).size(), 1, 0L); assertRecords(recordBatchList.get(1), actual1, TestFixtures.SCHEMA); batch0.recycle(); final RecordsWithSplitIds> batch2 = reader.next(); - final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2, 0L); assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); batch2.recycle(); } @Test public void testCheckpointedPositionBeforeSecondFile() throws IOException { - final IcebergSourceSplit split = new IcebergSourceSplit( - icebergSplit.task(), - new Position(1L, 0L)); + final IcebergSourceSplit split = new IcebergSourceSplit(icebergSplit.task(), + new Position(1, 0L)); final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch1 = reader.next(); - final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1L, 0L); + final List actual1 = extractRecordsAndAssertPosition(batch1, recordBatchList.get(1).size(), 1, 0L); 
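// Position(1, 0L) = file offset 1, record offset 0: the reader seeks past the entire first file,
// so this batch holds the second file's records (tagged with file offset 1) and the next batch
// holds the third file's (file offset 2).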
assertRecords(recordBatchList.get(1), actual1, TestFixtures.SCHEMA); batch1.recycle(); final RecordsWithSplitIds> batch2 = reader.next(); - final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2, 0L); assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); batch2.recycle(); } @@ -260,16 +257,16 @@ public void testCheckpointedPositionBeforeSecondFile() throws IOException { public void testCheckpointedPositionMidSecondFile() throws IOException { final IcebergSourceSplit split = new IcebergSourceSplit( icebergSplit.task(), - new Position(1L, 1L)); + new Position(1, 1L)); final CloseableIterator>> reader = readerFunction().apply(split); final RecordsWithSplitIds> batch1 = reader.next(); - final List actual1 = extractRecordsAndAssertPosition(batch1, 1L, 1L, 1L); + final List actual1 = extractRecordsAndAssertPosition(batch1, 1L, 1, 1L); assertRecords(recordBatchList.get(1).subList(1, 2), actual1, TestFixtures.SCHEMA); batch1.recycle(); final RecordsWithSplitIds> batch2 = reader.next(); - final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2L, 0L); + final List actual2 = extractRecordsAndAssertPosition(batch2, recordBatchList.get(2).size(), 2, 0L); assertRecords(recordBatchList.get(2), actual2, TestFixtures.SCHEMA); batch2.recycle(); } diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java index 83d5642f3c00..020d56438c05 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestIcebergSourceSplitReader.java @@ -26,7 +26,6 @@ import org.apache.flink.configuration.Configuration; import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds; import org.apache.flink.connector.base.source.reader.splitreader.SplitsAddition; -import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.apache.flink.connector.testutils.source.reader.TestingReaderContext; import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; import org.apache.flink.table.data.RowData; @@ -114,16 +113,16 @@ public void testFullScan() throws Exception { reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); final RecordsWithSplitIds> readBatch0 = reader.fetch(); - final List rowBatch0 = readRows(readBatch0, split.splitId(), 0L, 0L); + final List rowBatch0 = readRows(readBatch0, split.splitId(), 0, 0L); TestHelpers.assertRecords(rowBatch0, recordBatchList.get(0), TestFixtures.SCHEMA); final RecordsWithSplitIds> readBatch1 = reader.fetch(); - final List rowBatch1 = readRows(readBatch1, split.splitId(), 1L, 0L); + final List rowBatch1 = readRows(readBatch1, split.splitId(), 1, 0L); TestHelpers.assertRecords(rowBatch1, recordBatchList.get(1), TestFixtures.SCHEMA); final RecordsWithSplitIds> readBatch2 = reader.fetch(); - final List rowBatch2 = readRows(readBatch2, split.splitId(), 2L, 0L); + final List rowBatch2 = readRows(readBatch2, split.splitId(), 2, 0L); TestHelpers.assertRecords(rowBatch2, recordBatchList.get(2), TestFixtures.SCHEMA); final RecordsWithSplitIds> finishedBatch = reader.fetch(); @@ -133,15 +132,15 @@ public void testFullScan() throws Exception { @Test public void testResumeFromEndOfFirstBatch() throws Exception { - final 
IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 0L, 2L); + final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 0, 2L); reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); final RecordsWithSplitIds> readBatch1 = reader.fetch(); - final List rowBatch1 = readRows(readBatch1, split.splitId(), 1L, 0L); + final List rowBatch1 = readRows(readBatch1, split.splitId(), 1, 0L); TestHelpers.assertRecords(rowBatch1, recordBatchList.get(1), TestFixtures.SCHEMA); final RecordsWithSplitIds> readBatch2 = reader.fetch(); - final List rowBatch2 = readRows(readBatch2, split.splitId(), 2L, 0L); + final List rowBatch2 = readRows(readBatch2, split.splitId(), 2, 0L); TestHelpers.assertRecords(rowBatch2, recordBatchList.get(2), TestFixtures.SCHEMA); final RecordsWithSplitIds> finishedBatch = reader.fetch(); @@ -151,15 +150,15 @@ public void testResumeFromEndOfFirstBatch() throws Exception { @Test public void testResumeFromStartOfSecondBatch() throws Exception { - final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 1L, 0L); + final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 1, 0L); reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); final RecordsWithSplitIds> readBatch1 = reader.fetch(); - final List rowBatch1 = readRows(readBatch1, split.splitId(), 1L, 0L); + final List rowBatch1 = readRows(readBatch1, split.splitId(), 1, 0L); TestHelpers.assertRecords(rowBatch1, recordBatchList.get(1), TestFixtures.SCHEMA); final RecordsWithSplitIds> readBatch2 = reader.fetch(); - final List rowBatch2 = readRows(readBatch2, split.splitId(), 2L, 0L); + final List rowBatch2 = readRows(readBatch2, split.splitId(), 2, 0L); TestHelpers.assertRecords(rowBatch2, recordBatchList.get(2), TestFixtures.SCHEMA); final RecordsWithSplitIds> finishedBatch @@ -170,15 +169,15 @@ public void testResumeFromStartOfSecondBatch() throws Exception { @Test public void testResumeFromMiddleOfSecondBatch() throws Exception { - final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 1L, 1L); + final IcebergSourceSplit split = IcebergSourceSplit.fromCombinedScanTask(icebergSplit.task(), 1, 1L); reader.handleSplitsChanges(new SplitsAddition(Arrays.asList(split))); final RecordsWithSplitIds> readBatch1 = reader.fetch(); - final List rowBatch1 = readRows(readBatch1, split.splitId(), 1L, 1L); + final List rowBatch1 = readRows(readBatch1, split.splitId(), 1, 1L); TestHelpers.assertRecords(rowBatch1, recordBatchList.get(1).subList(1, 2), TestFixtures.SCHEMA); final RecordsWithSplitIds> readBatch2 = reader.fetch(); - final List rowBatch2 = readRows(readBatch2, split.splitId(), 2L, 0L); + final List rowBatch2 = readRows(readBatch2, split.splitId(), 2, 0L); TestHelpers.assertRecords(rowBatch2, recordBatchList.get(2), TestFixtures.SCHEMA); final RecordsWithSplitIds> finishedBatch @@ -189,16 +188,16 @@ public void testResumeFromMiddleOfSecondBatch() throws Exception { private List readRows( RecordsWithSplitIds> readBatch, - String expectedSplitId, long expectedOffset, long expectedStartingRecordOffset) { + String expectedSplitId, int expectedFileOffset, long expectedStartingRecordOffset) { Assert.assertEquals(expectedSplitId, readBatch.nextSplit()); final List rowDataList = new ArrayList<>(); RecordAndPosition row; int num = 0; while ((row = readBatch.nextRecordFromSplit()) != null) { - Assert.assertEquals(expectedOffset, 
row.getOffset()); + Assert.assertEquals(expectedFileOffset, row.fileOffset()); num++; - Assert.assertEquals(expectedStartingRecordOffset + num, row.getRecordSkipCount()); - rowDataList.add(row.getRecord()); + Assert.assertEquals(expectedStartingRecordOffset + num, row.recordOffset()); + rowDataList.add(row.record()); } readBatch.recycle(); return TestHelpers.convertRowDataToRow(rowDataList, TestFixtures.ROW_TYPE); diff --git a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRecyclableArrayIterator.java b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRecyclableArrayIterator.java index eb3503ed5371..bf36efcfdc34 100644 --- a/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRecyclableArrayIterator.java +++ b/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestRecyclableArrayIterator.java @@ -20,7 +20,6 @@ package org.apache.iceberg.flink.source.reader; import java.util.concurrent.atomic.AtomicBoolean; -import org.apache.flink.connector.file.src.util.RecordAndPosition; import org.junit.Assert; import org.junit.Test; @@ -37,7 +36,7 @@ public void testEmptyConstruction() { @Test public void testGetElements() { final String[] elements = new String[]{"1", "2", "3", "4"}; - final long initialOffset = 3; + final int initialOffset = 3; final long initialSkipCount = 17; // dummy recycler @@ -46,9 +45,9 @@ public void testGetElements() { for (int i = 0; i < elements.length; i++) { final RecordAndPosition recAndPos = iter.next(); - Assert.assertEquals(elements[i], recAndPos.getRecord()); - Assert.assertEquals(initialOffset, recAndPos.getOffset()); - Assert.assertEquals(initialSkipCount + i + 1, recAndPos.getRecordSkipCount()); + Assert.assertEquals(elements[i], recAndPos.record()); + Assert.assertEquals(initialOffset, recAndPos.fileOffset()); + Assert.assertEquals(initialSkipCount + i + 1, recAndPos.recordOffset()); } } @@ -56,7 +55,7 @@ public void testGetElements() { public void testExhausted() { // dummy recycler final RecyclableArrayIterator iter = new RecyclableArrayIterator<>( - ignored -> System.currentTimeMillis(), new String[]{"1", "2"}, 2, 0L, 0L); + ignored -> System.currentTimeMillis(), new String[]{"1", "2"}, 2, 0, 0L); iter.next(); iter.next(); @@ -68,7 +67,7 @@ public void testExhausted() { public void testArraySubRange() { // dummy recycler final RecyclableArrayIterator iter = new RecyclableArrayIterator<>(ignored -> System.currentTimeMillis(), - new String[]{"1", "2", "3"}, 2, 0L, 0L); + new String[]{"1", "2", "3"}, 2, 0, 0L); Assert.assertNotNull(iter.next()); Assert.assertNotNull(iter.next());
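// num is 2 while the backing array has 3 elements, so iteration stops after "1" and "2";
// the third array element is never returned.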