KAFKA-8806 Reduce calls to validateOffsetsIfNeeded #7222
Changes from all commits: a6d3f90, b21a304, b2e9cce, c4f75a0, 5ee309e, 7e31731, 4db04ac, 8acd79c, cdb4dcc, 33776b2, 79ad6bc, b60a99a, 13d63d8, aa0a77d, 0788f6a, c196900, 8cfbfa1, 89f87c0, 293684d, 315de4a, d6552ce, 6f3ed50
@@ -155,6 +155,8 @@ public class Fetcher<K, V> implements Closeable {
     private final OffsetsForLeaderEpochClient offsetsForLeaderEpochClient;
     private final Set<Integer> nodesWithPendingFetchRequests;
     private final ApiVersions apiVersions;
+    private final AtomicInteger metadataUpdateVersion = new AtomicInteger(-1);

     private CompletedFetch nextInLineFetch = null;
@@ -486,10 +488,8 @@ public void validateOffsetsIfNeeded() {
            throw exception;

-        // Validate each partition against the current leader and epoch
-        subscriptions.assignedPartitions().forEach(topicPartition -> {
-            ConsumerMetadata.LeaderAndEpoch leaderAndEpoch = metadata.currentLeader(topicPartition);
-            subscriptions.maybeValidatePositionForCurrentLeader(apiVersions, topicPartition, leaderAndEpoch);
-        });
+        // If we see a new metadata version, check all partitions
+        validatePositionsOnMetadataChange();

        // Collect positions needing validation, with backoff
        Map<TopicPartition, FetchPosition> partitionsToValidate = subscriptions
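This hunk replaces an unconditional walk over every assigned partition with a call that only does the work when the consumer has seen new metadata. For reference, here is a minimal standalone sketch of the version-gating idiom that the new validatePositionsOnMetadataChange() method (added further down in this file) relies on; the class and method names are hypothetical and only illustrate the AtomicInteger.getAndSet pattern, not the PR's code:

    import java.util.concurrent.atomic.AtomicInteger;

    // Hypothetical illustration: re-run an expensive task only when an externally
    // observed version number has changed since the last time we ran it.
    public class VersionGatedRevalidation {
        private final AtomicInteger lastHandledVersion = new AtomicInteger(-1);

        /** Runs the task only if currentVersion differs from the last version we handled. */
        public boolean runIfVersionChanged(int currentVersion, Runnable task) {
            // getAndSet returns the previous value and stores the new one atomically, so a
            // repeated call with an unchanged version is a cheap no-op.
            if (lastHandledVersion.getAndSet(currentVersion) != currentVersion) {
                task.run();
                return true;
            }
            return false;
        }

        public static void main(String[] args) {
            VersionGatedRevalidation gate = new VersionGatedRevalidation();
            Runnable validateAll = () -> System.out.println("validating all assigned partitions");
            System.out.println(gate.runIfVersionChanged(1, validateAll)); // true: first sighting of version 1
            System.out.println(gate.runIfVersionChanged(1, validateAll)); // false: version unchanged, work skipped
            System.out.println(gate.runIfVersionChanged(2, validateAll)); // true: new version observed
        }
    }

The per-partition validation then runs once per metadata update rather than once per poll, which is the point of the JIRA.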
@@ -722,9 +722,11 @@ private List<ConsumerRecord<K, V>> fetchRecords(CompletedFetch completedFetch, i
     // Visible for testing
     void resetOffsetIfNeeded(TopicPartition partition, OffsetResetStrategy requestedResetStrategy, ListOffsetData offsetData) {
         FetchPosition position = new FetchPosition(
-                offsetData.offset, offsetData.leaderEpoch, metadata.currentLeader(partition));
+                offsetData.offset,
+                Optional.empty(), // This will ensure we skip validation
+                metadata.currentLeader(partition));
         offsetData.leaderEpoch.ifPresent(epoch -> metadata.updateLastSeenEpochIfNewer(partition, epoch));
-        subscriptions.maybeSeekUnvalidated(partition, position.offset, requestedResetStrategy);
+        subscriptions.maybeSeekUnvalidated(partition, position, requestedResetStrategy);
     }

     private void resetOffsetsAsync(Map<TopicPartition, Long> partitionResetTimestamps) {
@@ -782,6 +784,7 @@ private void validateOffsetsAsync(Map<TopicPartition, FetchPosition> partitionsT
        final Map<Node, Map<TopicPartition, FetchPosition>> regrouped =
                regroupFetchPositionsByLeader(partitionsToValidate);

+        long nextResetTimeMs = time.milliseconds() + requestTimeoutMs;
        regrouped.forEach((node, fetchPositions) -> {
            if (node.isEmpty()) {
                metadata.requestUpdate();
@@ -804,7 +807,7 @@ private void validateOffsetsAsync(Map<TopicPartition, FetchPosition> partitionsT
                return;
            }

-            subscriptions.setNextAllowedRetry(fetchPositions.keySet(), time.milliseconds() + requestTimeoutMs);
+            subscriptions.setNextAllowedRetry(fetchPositions.keySet(), nextResetTimeMs);

            RequestFuture<OffsetForEpochResult> future =
                    offsetsForLeaderEpochClient.sendAsyncRequest(node, fetchPositions);
@@ -1120,17 +1123,28 @@ Node selectReadReplica(TopicPartition partition, Node leaderReplica, long curren
        }
    }

+    /**
+     * If we have seen new metadata (as tracked by {@link org.apache.kafka.clients.Metadata#updateVersion()}), then
+     * we should check that all of the assignments have a valid position.
+     */
+    private void validatePositionsOnMetadataChange() {
+        int newMetadataUpdateVersion = metadata.updateVersion();
+        if (metadataUpdateVersion.getAndSet(newMetadataUpdateVersion) != newMetadataUpdateVersion) {
+            subscriptions.assignedPartitions().forEach(topicPartition -> {
+                ConsumerMetadata.LeaderAndEpoch leaderAndEpoch = metadata.currentLeader(topicPartition);
+                subscriptions.maybeValidatePositionForCurrentLeader(apiVersions, topicPartition, leaderAndEpoch);
+            });
+        }
+    }
+
    /**
     * Create fetch requests for all nodes for which we have assigned partitions
     * that have no existing requests in flight.
     */
    private Map<Node, FetchSessionHandler.FetchRequestData> prepareFetchRequests() {
Contributor:
The

Member (Author):
I'm not sure it saves anything in this case. The main benefit of

Member:
We can avoid a copy in the case @hachikuji mentions as well, right? See below:

    synchronized List<TopicPartition> fetchablePartitions(Predicate<TopicPartition> isAvailable) {
        return assignment.stream()
                .filter(tpState -> isAvailable.test(tpState.topicPartition()) && tpState.value().isFetchable())
                .map(PartitionStates.PartitionState::topicPartition)
                .collect(Collectors.toList());
    }

Member (Author):
Ah ok, I see what he means. I'll look into this.

Member (Author):
I made a pass at this and it wasn't so simple. Deferring for now.
        Map<Node, FetchSessionHandler.Builder> fetchable = new LinkedHashMap<>();

-        // Ensure the position has an up-to-date leader
-        subscriptions.assignedPartitions().forEach(tp ->
-                subscriptions.maybeValidatePositionForCurrentLeader(apiVersions, tp, metadata.currentLeader(tp))
-        );
+        validatePositionsOnMetadataChange();

        long currentTimeMs = time.milliseconds();
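Picking up the review thread above about avoiding a copy: below is a hypothetical sketch (the types are stand-ins, not the PR's code or Kafka's API) of how a callback-style accessor could hand out fetchable partitions without materializing an intermediate list at all, which is the direction the author deferred for a later change:

    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.function.Consumer;
    import java.util.function.Predicate;

    // Hypothetical sketch of the "avoid the copy" idea: push each fetchable partition to a
    // callback instead of collecting them into a List first.
    class FetchableIterationSketch {
        // Minimal stand-ins for TopicPartition and its per-partition state.
        static final class Partition {
            final String topic; final int id;
            Partition(String topic, int id) { this.topic = topic; this.id = id; }
            @Override public String toString() { return topic + "-" + id; }
        }
        static final class State {
            final boolean fetchable;
            State(boolean fetchable) { this.fetchable = fetchable; }
        }

        private final Map<Partition, State> assignment = new LinkedHashMap<>();

        void assign(Partition p, State s) { assignment.put(p, s); }

        // No intermediate collection: the cheap fetchable flag is checked first, then the predicate.
        void forEachFetchable(Predicate<Partition> isAvailable, Consumer<Partition> action) {
            assignment.forEach((partition, state) -> {
                if (state.fetchable && isAvailable.test(partition)) {
                    action.accept(partition);
                }
            });
        }

        public static void main(String[] args) {
            FetchableIterationSketch subscriptions = new FetchableIterationSketch();
            subscriptions.assign(new Partition("events", 0), new State(true));
            subscriptions.assign(new Partition("events", 1), new State(false));
            subscriptions.forEachFetchable(p -> true, p -> System.out.println("fetchable: " + p));
        }
    }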
@@ -1142,6 +1156,7 @@ private Map<Node, FetchSessionHandler.FetchRequestData> prepareFetchRequests() {

            Optional<Node> leaderOpt = position.currentLeader.leader;
            if (!leaderOpt.isPresent()) {
+                log.debug("Requesting metadata update for partition {} since the position {} is missing the current leader node", partition, position);
                metadata.requestUpdate();
                continue;
            }
@@ -1154,7 +1169,6 @@ private Map<Node, FetchSessionHandler.FetchRequestData> prepareFetchRequests() {
                // If we try to send during the reconnect blackout window, then the request is just
                // going to be failed anyway before being sent, so skip the send for now
                log.trace("Skipping fetch for partition {} because node {} is awaiting reconnect backoff", partition, node);
            } else if (this.nodesWithPendingFetchRequests.contains(node.id())) {
                log.trace("Skipping fetch for partition {} because previous request to {} has not been processed", partition, node);
            } else {
@@ -36,6 +36,7 @@
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
@@ -44,8 +45,6 @@
 import java.util.function.LongSupplier;
 import java.util.function.Predicate;
 import java.util.regex.Pattern;
-import java.util.stream.Collector;
-import java.util.stream.Collectors;

 import static org.apache.kafka.clients.consumer.internals.Fetcher.hasUsableOffsetForLeaderEpochVersion;
@@ -330,7 +329,7 @@ public synchronized Set<String> subscription() {
    }

    public synchronized Set<TopicPartition> pausedPartitions() {
-        return collectPartitions(TopicPartitionState::isPaused, Collectors.toSet());
+        return collectPartitions(TopicPartitionState::isPaused);
    }

    /**
@@ -385,7 +384,7 @@ public void seekUnvalidated(TopicPartition tp, FetchPosition position) {
        assignedState(tp).seekUnvalidated(position);
    }

-    synchronized void maybeSeekUnvalidated(TopicPartition tp, long offset, OffsetResetStrategy requestedResetStrategy) {
+    synchronized void maybeSeekUnvalidated(TopicPartition tp, FetchPosition position, OffsetResetStrategy requestedResetStrategy) {
        TopicPartitionState state = assignedStateOrNull(tp);
        if (state == null) {
            log.debug("Skipping reset of partition {} since it is no longer assigned", tp);
@@ -394,8 +393,8 @@ synchronized void maybeSeekUnvalidated(TopicPartition tp, long offset, OffsetRes
        } else if (requestedResetStrategy != state.resetStrategy) {
            log.debug("Skipping reset of partition {} since an alternative reset has been requested", tp);
        } else {
-            log.info("Resetting offset for partition {} to offset {}.", tp, offset);
-            state.seekUnvalidated(new FetchPosition(offset));
+            log.info("Resetting offset for partition {} to position {}.", tp, position);
+            state.seekUnvalidated(position);
        }
    }
@@ -421,11 +420,17 @@ synchronized int numAssignedPartitions() {
        return this.assignment.size();
    }

-    synchronized List<TopicPartition> fetchablePartitions(Predicate<TopicPartition> isAvailable) {
-        return assignment.stream()
-                .filter(tpState -> isAvailable.test(tpState.topicPartition()) && tpState.value().isFetchable())
-                .map(PartitionStates.PartitionState::topicPartition)
-                .collect(Collectors.toList());
+    // Visible for testing
+    public synchronized List<TopicPartition> fetchablePartitions(Predicate<TopicPartition> isAvailable) {
+        // Since this is in the hot-path for fetching, we do this instead of using java.util.stream API
+        List<TopicPartition> result = new ArrayList<>();
Contributor:
We should add a small comment that this is in the hotpath and is written the "ugly" way for a reason. It's also probably worth mentioning that we do the cheap isFetchable check first.
+        assignment.forEach((topicPartition, topicPartitionState) -> {
+            // Cheap check is first to avoid evaluating the predicate if possible
+            if (topicPartitionState.isFetchable() && isAvailable.test(topicPartition)) {
+                result.add(topicPartition);
+            }
+        });
+        return result;
    }

    public synchronized boolean hasAutoAssignedPartitions() {
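As the reviewer asked, the new code documents that the cheap isFetchable check runs before the caller-supplied predicate. A small hypothetical sketch (names invented for illustration) of why that ordering matters: with short-circuiting &&, the potentially expensive availability predicate is only evaluated for partitions that are fetchable in the first place.

    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.concurrent.atomic.AtomicInteger;
    import java.util.function.Predicate;

    // Hypothetical sketch: count how often the expensive predicate actually runs when the
    // cheap boolean check is placed first in the && chain.
    public class CheapCheckFirstSketch {
        public static void main(String[] args) {
            Map<String, Boolean> fetchableByPartition = new LinkedHashMap<>();
            fetchableByPartition.put("events-0", true);
            fetchableByPartition.put("events-1", false);
            fetchableByPartition.put("events-2", false);

            AtomicInteger predicateCalls = new AtomicInteger();
            Predicate<String> isAvailable = partition -> {
                predicateCalls.incrementAndGet(); // pretend this is an expensive node lookup
                return true;
            };

            fetchableByPartition.forEach((partition, fetchable) -> {
                if (fetchable && isAvailable.test(partition)) {
                    System.out.println("would fetch " + partition);
                }
            });

            // Only 1 call, not 3: the predicate never ran for the non-fetchable partitions.
            System.out.println("expensive predicate evaluated " + predicateCalls.get() + " time(s)");
        }
    }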
@@ -596,10 +601,9 @@ public synchronized Optional<Integer> clearPreferredReadReplica(TopicPartition t

    public synchronized Map<TopicPartition, OffsetAndMetadata> allConsumed() {
        Map<TopicPartition, OffsetAndMetadata> allConsumed = new HashMap<>();
-        assignment.stream().forEach(state -> {
-            TopicPartitionState partitionState = state.value();
+        assignment.forEach((topicPartition, partitionState) -> {
            if (partitionState.hasValidPosition())
-                allConsumed.put(state.topicPartition(), new OffsetAndMetadata(partitionState.position.offset,
+                allConsumed.put(topicPartition, new OffsetAndMetadata(partitionState.position.offset,
                    partitionState.position.offsetEpoch, ""));
        });
        return allConsumed;
@@ -639,26 +643,34 @@ public synchronized OffsetResetStrategy resetStrategy(TopicPartition partition) {
    }

    public synchronized boolean hasAllFetchPositions() {
-        return assignment.stream().allMatch(state -> state.value().hasValidPosition());
+        // Since this is in the hot-path for fetching, we do this instead of using java.util.stream API
+        Iterator<TopicPartitionState> it = assignment.stateIterator();
+        while (it.hasNext()) {
+            if (!it.next().hasValidPosition()) {
+                return false;
+            }
+        }
+        return true;
Member:
@mumrah I now understand why the previous version was slower, it was allocating a PartitionState instance per element. But we only use the value here. So, we could still use
    }

    public synchronized Set<TopicPartition> initializingPartitions() {
-        return collectPartitions(state -> state.fetchState.equals(FetchStates.INITIALIZING), Collectors.toSet());
+        return collectPartitions(state -> state.fetchState.equals(FetchStates.INITIALIZING));
    }

-    private <T extends Collection<TopicPartition>> T collectPartitions(Predicate<TopicPartitionState> filter, Collector<TopicPartition, ?, T> collector) {
-        return assignment.stream()
-                .filter(state -> filter.test(state.value()))
-                .map(PartitionStates.PartitionState::topicPartition)
-                .collect(collector);
+    private Set<TopicPartition> collectPartitions(Predicate<TopicPartitionState> filter) {
+        Set<TopicPartition> result = new HashSet<>();
+        assignment.forEach((topicPartition, topicPartitionState) -> {
+            if (filter.test(topicPartitionState)) {
+                result.add(topicPartition);
+            }
+        });
+        return result;
    }

    public synchronized void resetInitializingPositions() {
        final Set<TopicPartition> partitionsWithNoOffsets = new HashSet<>();
-        assignment.stream().forEach(state -> {
-            TopicPartition tp = state.topicPartition();
-            TopicPartitionState partitionState = state.value();
+        assignment.forEach((tp, partitionState) -> {
            if (partitionState.fetchState.equals(FetchStates.INITIALIZING)) {
                if (defaultResetStrategy == OffsetResetStrategy.NONE)
                    partitionsWithNoOffsets.add(tp);
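The review comment on hasAllFetchPositions above points out that the old stream-based version was slower because it allocated a wrapper (PartitionState) object per element even though only the value is consulted, and suggests a value-only alternative would also work. Below is a hypothetical sketch of that idea with invented stand-in types (not Kafka's PartitionStates API): one variant walks an iterator over the values, as the new code does, and one reading of the truncated suggestion streams directly over the values.

    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;

    // Hypothetical sketch: iterate only the values we actually use, instead of wrapping
    // each (partition, state) pair in an intermediate object per element.
    public class ValueIterationSketch {
        // Stand-in for a per-partition state; only hasValidPosition() is consulted.
        static final class PartitionState {
            final boolean validPosition;
            PartitionState(boolean validPosition) { this.validPosition = validPosition; }
            boolean hasValidPosition() { return validPosition; }
        }
        // Stand-in for the assignment container, exposing an iterator over values only.
        static final class States {
            private final List<PartitionState> states = new ArrayList<>();
            void add(PartitionState s) { states.add(s); }
            Iterator<PartitionState> stateIterator() { return states.iterator(); }
        }

        // Variant used by the PR: a plain loop over the value iterator, bailing out early.
        static boolean hasAllFetchPositions(States assignment) {
            Iterator<PartitionState> it = assignment.stateIterator();
            while (it.hasNext()) {
                if (!it.next().hasValidPosition()) {
                    return false; // early exit, no per-element wrapper objects created
                }
            }
            return true;
        }

        // One reading of the reviewer's suggestion: a stream over the values alone also
        // avoids the per-element wrapper, at the cost of the usual stream machinery.
        static boolean hasAllFetchPositionsStream(States assignment) {
            return assignment.states.stream().allMatch(PartitionState::hasValidPosition);
        }

        public static void main(String[] args) {
            States assignment = new States();
            assignment.add(new PartitionState(true));
            assignment.add(new PartitionState(false));
            System.out.println(hasAllFetchPositions(assignment));       // false
            System.out.println(hasAllFetchPositionsStream(assignment)); // false
        }
    }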
@@ -672,13 +684,11 @@ public synchronized void resetInitializingPositions() {
    }

    public synchronized Set<TopicPartition> partitionsNeedingReset(long nowMs) {
-        return collectPartitions(state -> state.awaitingReset() && !state.awaitingRetryBackoff(nowMs),
-                Collectors.toSet());
+        return collectPartitions(state -> state.awaitingReset() && !state.awaitingRetryBackoff(nowMs));
    }

    public synchronized Set<TopicPartition> partitionsNeedingValidation(long nowMs) {
-        return collectPartitions(state -> state.awaitingValidation() && !state.awaitingRetryBackoff(nowMs),
-                Collectors.toSet());
+        return collectPartitions(state -> state.awaitingValidation() && !state.awaitingRetryBackoff(nowMs));
    }

    public synchronized boolean isAssigned(TopicPartition tp) {