Flink: store watermark as iceberg table's property #2109
Changes from all commits
@@ -30,12 +30,14 @@
 import org.apache.flink.api.common.state.ListStateDescriptor;
 import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
 import org.apache.flink.api.common.typeinfo.PrimitiveArrayTypeInfo;
+import org.apache.flink.api.java.typeutils.MapTypeInfo;
 import org.apache.flink.core.io.SimpleVersionedSerialization;
 import org.apache.flink.runtime.state.StateInitializationContext;
 import org.apache.flink.runtime.state.StateSnapshotContext;
 import org.apache.flink.streaming.api.operators.AbstractStreamOperator;
 import org.apache.flink.streaming.api.operators.BoundedOneInput;
 import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
+import org.apache.flink.streaming.api.watermark.Watermark;
 import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
 import org.apache.flink.table.runtime.typeutils.SortedMapTypeInfo;
 import org.apache.iceberg.AppendFiles;
@@ -55,6 +57,7 @@
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
 import org.apache.iceberg.types.Comparators;
 import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.PropertyUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -73,6 +76,8 @@ class IcebergFilesCommitter extends AbstractStreamOperator<Void>
   // avoiding committing the same data files twice. This id will be attached to iceberg's meta when committing the
   // iceberg transaction.
   private static final String MAX_COMMITTED_CHECKPOINT_ID = "flink.max-committed-checkpoint-id";
+  static final String CURRENT_WATERMARK = "flink.current-watermark";
+  static final String STORE_WATERMARK = "flink.store-watermark";

   // TableLoader to load iceberg table lazily.
   private final TableLoader tableLoader;
@@ -85,6 +90,7 @@ class IcebergFilesCommitter extends AbstractStreamOperator<Void>
   // any data loss in iceberg table. So we keep the finished files <1, <file0, file1>> in memory and retry to commit
   // iceberg table when the next checkpoint happen.
   private final NavigableMap<Long, byte[]> dataFilesPerCheckpoint = Maps.newTreeMap();
+  private final Map<Long, Long> watermarkPerCheckpoint = Maps.newHashMap();

   // The completed files cache for current checkpoint. Once the snapshot barrier received, it will be flushed to the
   // 'dataFilesPerCheckpoint'.
@@ -95,6 +101,8 @@ class IcebergFilesCommitter extends AbstractStreamOperator<Void>
   private transient Table table;
   private transient ManifestOutputFileFactory manifestOutputFileFactory;
   private transient long maxCommittedCheckpointId;
+  private transient long currentWatermark;
+  private transient boolean storeWatermark;

   // There're two cases that we restore from flink checkpoints: the first case is restoring from snapshot created by the
   // same flink job; another case is restoring from snapshot created by another different job. For the second case, we
@@ -106,6 +114,9 @@ class IcebergFilesCommitter extends AbstractStreamOperator<Void>
   // All pending checkpoints states for this function.
   private static final ListStateDescriptor<SortedMap<Long, byte[]>> STATE_DESCRIPTOR = buildStateDescriptor();
   private transient ListState<SortedMap<Long, byte[]>> checkpointsState;
+  private static final ListStateDescriptor<Map<Long, Long>> WATERMARK_DESCRIPTOR = new ListStateDescriptor<>(
+      "iceberg-flink-watermark", new MapTypeInfo<>(BasicTypeInfo.LONG_TYPE_INFO, BasicTypeInfo.LONG_TYPE_INFO));
+  private transient ListState<Map<Long, Long>> watermarkState;
Review comment: Should we define a … here? Ideally, I would prefer the metadata and the manifest file bundled in a single class (per checkpoint). That would require handling state schema evolution, though, and I am not sure that complexity is worth the effort.

   IcebergFilesCommitter(TableLoader tableLoader, boolean replacePartitions) {
     this.tableLoader = tableLoader;
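On the bundling idea in the comment above: one possible shape is a small per-checkpoint state class. This is purely illustrative; the class name and fields are assumptions, and, as the comment notes, evolving such a class later would require handling state schema evolution (e.g. via a versioned serializer).

// Hypothetical per-checkpoint state bundling the serialized manifest data
// and the watermark, instead of two parallel maps keyed by checkpoint id.
class CheckpointState implements java.io.Serializable {
  private final byte[] manifestBytes;  // serialized manifest for the checkpoint
  private final long watermark;        // watermark captured at snapshot time

  CheckpointState(byte[] manifestBytes, long watermark) {
    this.manifestBytes = manifestBytes;
    this.watermark = watermark;
  }

  byte[] manifestBytes() {
    return manifestBytes;
  }

  long watermark() {
    return watermark;
  }
}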
@@ -126,9 +137,16 @@ public void initializeState(StateInitializationContext context) throws Exception
     this.manifestOutputFileFactory = FlinkManifestUtil.createOutputFileFactory(table, flinkJobId, subTaskId, attemptId);
     this.maxCommittedCheckpointId = INITIAL_CHECKPOINT_ID;

+    Map<String, String> properties = this.table.properties();
+    this.currentWatermark = PropertyUtil.propertyAsLong(properties, CURRENT_WATERMARK, -1L);
+    this.storeWatermark = PropertyUtil.propertyAsBoolean(properties, STORE_WATERMARK, false);
+
     this.checkpointsState = context.getOperatorStateStore().getListState(STATE_DESCRIPTOR);
     this.jobIdState = context.getOperatorStateStore().getListState(JOB_ID_DESCRIPTOR);
+    this.watermarkState = context.getOperatorStateStore().getListState(WATERMARK_DESCRIPTOR);

     if (context.isRestored()) {
+      watermarkPerCheckpoint.putAll(watermarkState.get().iterator().next());
Review comment: Will this be backwards compatible for ongoing streaming jobs that don't have any existing watermark state? Looking at the Flink …

Review comment (follow-up): Actually, on further inspection of the …, I don't have a strong opinion about either point 1 or point 2, but I thought it might be worth bringing up for discussion.

Reply: Thanks @kbendick, I neglected backwards compatibility; the current code will raise a NoSuchElementException here when restoring from a checkpoint that has no watermark state.
       String restoredFlinkJobId = jobIdState.get().iterator().next();
       Preconditions.checkState(!Strings.isNullOrEmpty(restoredFlinkJobId),
           "Flink job id parsed from checkpoint snapshot shouldn't be null or empty");
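A minimal sketch of a backwards-compatible restore, as the thread above suggests: guard the iterator instead of calling next() unconditionally (requires java.util.Iterator; variable names follow the diff, but the guard itself is an assumption, not code from this PR).

// Checkpoints written before this change carry no watermark state, so
// watermarkState.get() is empty there and iterator().next() would throw
// NoSuchElementException. Only restore when an entry actually exists.
Iterator<Map<Long, Long>> restoredWatermarks = watermarkState.get().iterator();
if (restoredWatermarks.hasNext()) {
  watermarkPerCheckpoint.putAll(restoredWatermarks.next());
}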
@@ -162,6 +180,10 @@ public void snapshotState(StateSnapshotContext context) throws Exception {
     checkpointsState.clear();
     checkpointsState.add(dataFilesPerCheckpoint);

+    watermarkPerCheckpoint.put(checkpointId, currentWatermark);
+    watermarkState.clear();
+    watermarkState.add(watermarkPerCheckpoint);
+
     jobIdState.clear();
     jobIdState.add(flinkJobId);
@@ -296,6 +318,13 @@ private void commitOperation(SnapshotUpdate<?> operation, int numDataFiles, int

     long start = System.currentTimeMillis();
     operation.commit(); // abort is automatically called if this fails.

+    Long watermarkForCheckpoint = watermarkPerCheckpoint.get(checkpointId);
Review comment: We need to use a table transaction here so that the data file commit and the watermark property update happen atomically (a sketch follows the hunk below).

Review comment: If we store the watermark in snapshot summary metadata as @rdblue said, we can omit this extra property commit.

Review comment: We actually set it as a table property too. I think table properties are easier for the workflow scheduler (in the batch system) to query. Otherwise, they would have to iterate the snapshots and find the latest watermarks for all 3 regions. cc @rdblue
+    if (storeWatermark && watermarkForCheckpoint != null && watermarkForCheckpoint > 0) {
+      table.updateProperties().set(CURRENT_WATERMARK, String.valueOf(watermarkForCheckpoint)).commit();
+      LOG.info("Update {} to {}", CURRENT_WATERMARK, watermarkForCheckpoint);
+    }
+
     long duration = System.currentTimeMillis() - start;
     LOG.info("Committed in {} ms", duration);
   }
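A rough sketch of the transaction idea raised above, assuming the commit path were reshaped around Table.newTransaction(); 'pendingDataFiles' is a hypothetical stand-in for this checkpoint's files, and the overall structure is illustrative rather than the PR's actual code.

// Stage the file append and the watermark property update in one Iceberg
// transaction so they become visible in a single atomic commit.
Transaction txn = table.newTransaction();

AppendFiles append = txn.newAppend();
pendingDataFiles.forEach(append::appendFile);  // hypothetical file list
append.set(MAX_COMMITTED_CHECKPOINT_ID, Long.toString(checkpointId));
append.commit();  // staged in the transaction, not yet visible

if (storeWatermark && watermarkForCheckpoint != null && watermarkForCheckpoint > 0) {
  txn.updateProperties()
      .set(CURRENT_WATERMARK, String.valueOf(watermarkForCheckpoint))
      .commit();  // also staged
}

txn.commitTransaction();  // both changes commit atomically

The snapshot-summary alternative would instead call append.set(CURRENT_WATERMARK, ...) on the data commit itself, avoiding the second metadata update entirely; the tradeoff noted above is that a batch scheduler then has to walk snapshots rather than read a single table property.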
@@ -305,6 +334,14 @@ public void processElement(StreamRecord<WriteResult> element) {
     this.writeResultsOfCurrentCkpt.add(element.getValue());
   }

+  @Override
+  public void processWatermark(Watermark mark) throws Exception {
+    super.processWatermark(mark);
+    if (mark.getTimestamp() != Watermark.MAX_WATERMARK.getTimestamp()) {
Review comment: Why do we need to ignore the MAX_WATERMARK?

Reply: As you said before, we use the watermark to indicate data completeness on the ingestion path, so I think we do not need to store the MAX_WATERMARK.
+      currentWatermark = mark.getTimestamp();
+    }
+  }
+
   @Override
   public void endInput() throws IOException {
     // Flush the buffered data files into 'dataFilesPerCheckpoint' firstly.
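For context on the thread above (a fact about Flink, not this PR's code): when a bounded input finishes, Flink emits Watermark.MAX_WATERMARK, whose timestamp is Long.MAX_VALUE rather than a real event-time watermark.

// Persisting Long.MAX_VALUE as "flink.current-watermark" would declare the
// table complete for all time, so processWatermark() filters it out.
assert Watermark.MAX_WATERMARK.getTimestamp() == Long.MAX_VALUE;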
Review comment: We probably should use SortedMap here.
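If the watermark state were switched to a sorted map as suggested, the descriptor could mirror how buildStateDescriptor() sets up STATE_DESCRIPTOR with SortedMapTypeInfo. A sketch, keeping the state name from the diff; the local variable names are assumptions:

// Order watermark entries by checkpoint id, matching the data-file state.
Comparator<Long> longComparator = Comparators.forType(Types.LongType.get());

ListStateDescriptor<SortedMap<Long, Long>> watermarkDescriptor = new ListStateDescriptor<>(
    "iceberg-flink-watermark",
    new SortedMapTypeInfo<>(BasicTypeInfo.LONG_TYPE_INFO, BasicTypeInfo.LONG_TYPE_INFO, longComparator));

// The in-memory map becomes a TreeMap, so entries up to a committed
// checkpoint can be pruned with headMap(checkpointId, true).clear().
NavigableMap<Long, Long> watermarkPerCheckpoint = Maps.newTreeMap(longComparator);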