-
Notifications
You must be signed in to change notification settings - Fork 13.9k
[FLINK-12883][WIP][runtime] Add elaborated partition release logic #8804
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
571ce50
8a3f540
86268c6
16868f0
669d1de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,13 +48,17 @@ | |
| import org.apache.flink.runtime.execution.SuppressRestartsException; | ||
| import org.apache.flink.runtime.executiongraph.failover.FailoverStrategy; | ||
| import org.apache.flink.runtime.executiongraph.failover.RestartAllStrategy; | ||
| import org.apache.flink.runtime.executiongraph.failover.adapter.DefaultFailoverTopology; | ||
| import org.apache.flink.runtime.executiongraph.failover.flip1.partitionrelease.NotReleasingPartitionReleaseStrategy; | ||
| import org.apache.flink.runtime.executiongraph.failover.flip1.partitionrelease.PartitionReleaseStrategy; | ||
| import org.apache.flink.runtime.executiongraph.restart.ExecutionGraphRestartCallback; | ||
| import org.apache.flink.runtime.executiongraph.restart.RestartCallback; | ||
| import org.apache.flink.runtime.executiongraph.restart.RestartStrategy; | ||
| import org.apache.flink.runtime.io.network.partition.PartitionTracker; | ||
| import org.apache.flink.runtime.io.network.partition.PartitionTrackerImpl; | ||
| import org.apache.flink.runtime.io.network.partition.ResultPartitionID; | ||
| import org.apache.flink.runtime.jobgraph.IntermediateDataSetID; | ||
| import org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID; | ||
| import org.apache.flink.runtime.jobgraph.JobStatus; | ||
| import org.apache.flink.runtime.jobgraph.JobVertex; | ||
| import org.apache.flink.runtime.jobgraph.JobVertexID; | ||
|
|
@@ -66,6 +70,11 @@ | |
| import org.apache.flink.runtime.jobmaster.slotpool.Scheduler; | ||
| import org.apache.flink.runtime.jobmaster.slotpool.SlotProvider; | ||
| import org.apache.flink.runtime.query.KvStateLocationRegistry; | ||
| import org.apache.flink.runtime.scheduler.adapter.ExecutionGraphToSchedulingTopologyAdapter; | ||
| import org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID; | ||
| import org.apache.flink.runtime.scheduler.strategy.SchedulingExecutionVertex; | ||
| import org.apache.flink.runtime.scheduler.strategy.SchedulingResultPartition; | ||
| import org.apache.flink.runtime.scheduler.strategy.SchedulingTopology; | ||
| import org.apache.flink.runtime.shuffle.NettyShuffleMaster; | ||
| import org.apache.flink.runtime.shuffle.ShuffleMaster; | ||
| import org.apache.flink.runtime.state.SharedStateRegistry; | ||
|
|
@@ -250,6 +259,12 @@ public class ExecutionGraph implements AccessExecutionGraph { | |
| /** The total number of vertices currently in the execution graph. */ | ||
| private int numVerticesTotal; | ||
|
|
||
| private final PartitionReleaseStrategy.Factory partitionReleaseStrategyFactory; | ||
|
|
||
| private PartitionReleaseStrategy partitionReleaseStrategy; | ||
|
|
||
| private SchedulingTopology schedulingTopology; | ||
|
|
||
| // ------ Configuration of the Execution ------- | ||
|
|
||
| /** Flag to indicate whether the scheduler may queue tasks for execution, or needs to be able | ||
|
|
@@ -413,6 +428,7 @@ public ExecutionGraph( | |
| userClassLoader, | ||
| blobWriter, | ||
| allocationTimeout, | ||
| new NotReleasingPartitionReleaseStrategy.Factory(), | ||
| NettyShuffleMaster.INSTANCE, | ||
| true, | ||
| new PartitionTrackerImpl( | ||
|
|
@@ -433,6 +449,7 @@ public ExecutionGraph( | |
| ClassLoader userClassLoader, | ||
| BlobWriter blobWriter, | ||
| Time allocationTimeout, | ||
| PartitionReleaseStrategy.Factory partitionReleaseStrategyFactory, | ||
| ShuffleMaster<?> shuffleMaster, | ||
| boolean forcePartitionReleaseOnConsumption, | ||
| PartitionTracker partitionTracker) throws IOException { | ||
|
|
@@ -464,6 +481,8 @@ public ExecutionGraph( | |
| this.rpcTimeout = checkNotNull(rpcTimeout); | ||
| this.allocationTimeout = checkNotNull(allocationTimeout); | ||
|
|
||
| this.partitionReleaseStrategyFactory = checkNotNull(partitionReleaseStrategyFactory); | ||
|
|
||
| this.restartStrategy = restartStrategy; | ||
| this.kvStateLocationRegistry = new KvStateLocationRegistry(jobInformation.getJobId(), getAllVertices()); | ||
|
|
||
|
|
@@ -913,6 +932,11 @@ public void attachJobGraph(List<JobVertex> topologiallySorted) throws JobExcepti | |
| } | ||
|
|
||
| failoverStrategy.notifyNewVertices(newExecJobVertices); | ||
|
|
||
| schedulingTopology = new ExecutionGraphToSchedulingTopologyAdapter(this); | ||
| partitionReleaseStrategy = partitionReleaseStrategyFactory.createInstance( | ||
| schedulingTopology, | ||
| new DefaultFailoverTopology(this)); | ||
| } | ||
|
|
||
| public void scheduleForExecution() throws JobException { | ||
|
|
@@ -1605,36 +1629,9 @@ public boolean updateState(TaskExecutionState state) { | |
|
|
||
| if (attempt != null) { | ||
| try { | ||
| Map<String, Accumulator<?, ?>> accumulators; | ||
|
|
||
| switch (state.getExecutionState()) { | ||
| case RUNNING: | ||
| return attempt.switchToRunning(); | ||
|
|
||
| case FINISHED: | ||
| // this deserialization is exception-free | ||
| accumulators = deserializeAccumulators(state); | ||
| attempt.markFinished(accumulators, state.getIOMetrics()); | ||
| return true; | ||
|
|
||
| case CANCELED: | ||
| // this deserialization is exception-free | ||
| accumulators = deserializeAccumulators(state); | ||
| attempt.completeCancelling(accumulators, state.getIOMetrics()); | ||
| return true; | ||
|
|
||
| case FAILED: | ||
| // this deserialization is exception-free | ||
| accumulators = deserializeAccumulators(state); | ||
| attempt.markFailed(state.getError(userClassLoader), accumulators, state.getIOMetrics()); | ||
| return true; | ||
|
|
||
| default: | ||
| // we mark as failed and return false, which triggers the TaskManager | ||
| // to remove the task | ||
| attempt.fail(new Exception("TaskManager sent illegal state update: " + state.getExecutionState())); | ||
| return false; | ||
| } | ||
| final boolean stateUpdated = updateStateInternal(state, attempt); | ||
| maybeReleasePartitions(attempt); | ||
| return stateUpdated; | ||
| } | ||
| catch (Throwable t) { | ||
| ExceptionUtils.rethrowIfFatalErrorOrOOM(t); | ||
|
|
@@ -1649,6 +1646,77 @@ public boolean updateState(TaskExecutionState state) { | |
| } | ||
| } | ||
|
|
||
| private boolean updateStateInternal(final TaskExecutionState state, final Execution attempt) { | ||
| Map<String, Accumulator<?, ?>> accumulators; | ||
|
|
||
| switch (state.getExecutionState()) { | ||
| case RUNNING: | ||
| return attempt.switchToRunning(); | ||
|
|
||
| case FINISHED: | ||
| // this deserialization is exception-free | ||
| accumulators = deserializeAccumulators(state); | ||
| attempt.markFinished(accumulators, state.getIOMetrics()); | ||
| return true; | ||
|
|
||
| case CANCELED: | ||
| // this deserialization is exception-free | ||
| accumulators = deserializeAccumulators(state); | ||
| attempt.completeCancelling(accumulators, state.getIOMetrics()); | ||
| return true; | ||
|
|
||
| case FAILED: | ||
| // this deserialization is exception-free | ||
| accumulators = deserializeAccumulators(state); | ||
| attempt.markFailed(state.getError(userClassLoader), accumulators, state.getIOMetrics()); | ||
| return true; | ||
|
|
||
| default: | ||
| // we mark as failed and return false, which triggers the TaskManager | ||
| // to remove the task | ||
| attempt.fail(new Exception("TaskManager sent illegal state update: " + state.getExecutionState())); | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| private void maybeReleasePartitions(final Execution attempt) { | ||
| final ExecutionVertexID finishedExecutionVertex = attempt.getVertex().getID(); | ||
|
|
||
| if (attempt.getState() == ExecutionState.FINISHED) { | ||
| final List<IntermediateResultPartitionID> releasablePartitions = partitionReleaseStrategy.vertexFinished(finishedExecutionVertex); | ||
| releasePartitions(releasablePartitions); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It might be easy for constructing
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I missed it. The current attempt is the consumer, and we need the corresponding producer attempt for the partition.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I still feel the following. Another option is that
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not worth it at the moment because it is not even clear whether it is a good thing to use both
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We might only need… If we add the |
||
| } else { | ||
| partitionReleaseStrategy.vertexUnfinished(finishedExecutionVertex); | ||
| } | ||
| } | ||
|
|
||
| private void releasePartitions(final List<IntermediateResultPartitionID> releasablePartitions) { | ||
| if (releasablePartitions.size() > 0) { | ||
| final List<ResultPartitionID> partitionIds = releasablePartitions.stream() | ||
| .map(this::createResultPartitionId) | ||
| .collect(Collectors.toList()); | ||
|
|
||
| partitionTracker.stopTrackingAndReleasePartitions(partitionIds); | ||
| } | ||
| } | ||
|
|
||
| private ResultPartitionID createResultPartitionId(final IntermediateResultPartitionID resultPartitionId) { | ||
| final SchedulingResultPartition schedulingResultPartition = schedulingTopology.getResultPartitionOrThrow(resultPartitionId); | ||
| final SchedulingExecutionVertex producer = schedulingResultPartition.getProducer(); | ||
| final ExecutionVertexID producerId = producer.getId(); | ||
| final JobVertexID jobVertexId = producerId.getJobVertexId(); | ||
| final ExecutionJobVertex jobVertex = getJobVertex(jobVertexId); | ||
| checkNotNull(jobVertex, "Unknown job vertex %s", jobVertexId); | ||
|
|
||
| final ExecutionVertex[] taskVertices = jobVertex.getTaskVertices(); | ||
| final int subtaskIndex = producerId.getSubtaskIndex(); | ||
| checkState(subtaskIndex < taskVertices.length, "Invalid subtask index %d for job vertex %s", subtaskIndex, jobVertexId); | ||
|
|
||
| final ExecutionVertex taskVertex = taskVertices[subtaskIndex]; | ||
| final Execution execution = taskVertex.getCurrentExecutionAttempt(); | ||
| return new ResultPartitionID(resultPartitionId, execution.getAttemptId()); | ||
| } | ||
|
|
||
| /** | ||
| * Deserializes accumulators from a task state update. | ||
| * | ||
|
|
@@ -1835,4 +1903,8 @@ ShuffleMaster<?> getShuffleMaster() { | |
| public PartitionTracker getPartitionTracker() { | ||
| return partitionTracker; | ||
| } | ||
|
|
||
| PartitionReleaseStrategy getPartitionReleaseStrategy() { | ||
| return partitionReleaseStrategy; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about initializing
`schedulingTopology` and `partitionReleaseStrategy` in the constructor? Then we could make them final, and there would be no need to maintain the class-level `partitionReleaseStrategyFactory`. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is not possible because the
`JobGraph` is attached to the `ExecutionGraph` in an extra step. `ExecutionGraphToSchedulingTopologyAdapter` adapts the `ExecutionGraph` eagerly. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I have not looked through the details in
`ExecutionGraphToSchedulingTopologyAdapter` before; I only saw that it relies on `ExecutionGraph`. Actually, it relies on some information that is available only after attaching the `JobGraph`.