Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[fix][broker] Fix unloadNamespaceBundlesGracefully can be stuck with extensible load manager #23349

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7e544b7
Add tests to reproduce
BewareMyPower Sep 24, 2024
d7b2d80
Fix Free event cannot be sent
BewareMyPower Sep 24, 2024
590e8dd
Close TopicPoliciesService before unload
BewareMyPower Sep 24, 2024
c4de2c6
Avoid blocking the thread that the topic policies reader is created
BewareMyPower Sep 24, 2024
810bd04
Tombstone the bundle gracefully
BewareMyPower Sep 24, 2024
c6540d9
Fix load data store related issues
BewareMyPower Sep 25, 2024
9b5d390
Cleanup BK for each test
BewareMyPower Sep 25, 2024
95bc705
Restore handleSkippedEvent and revert the change to shouldKeepLeft
BewareMyPower Sep 25, 2024
a9e9893
Fail all the lookup requests after clearOwnerships() is called
BewareMyPower Sep 25, 2024
ca51fda
Add enum state to replace started and disabling
BewareMyPower Sep 26, 2024
15fee17
Reuse the Disabled channel state to skip handling all events
BewareMyPower Sep 26, 2024
774e32d
Revert "Restore handleSkippedEvent and revert the change to shouldKee…
BewareMyPower Sep 26, 2024
0e4ccc7
Don't handle Free events specially in ServiceUnitStateDataConflictRes…
BewareMyPower Sep 26, 2024
8589e8b
Allow getOwnedServiceUnits() in DISABLED state
BewareMyPower Sep 26, 2024
bdff856
Stop load data report tasks in disableBroker()
BewareMyPower Sep 26, 2024
79fa08d
Remove unused field
BewareMyPower Sep 26, 2024
83a0b88
Fix LoadDataStoreTest#testShutdown
BewareMyPower Sep 26, 2024
e0fd852
Fix wrong logs
BewareMyPower Sep 26, 2024
7d31290
Revert changes on closeInternalTopics
BewareMyPower Sep 26, 2024
8068f1d
Skip scheduleCleanup if the state is Disabled
BewareMyPower Sep 26, 2024
09f40ff
Revert Free events handling on waitForCleanups
BewareMyPower Sep 26, 2024
d429420
Add log for possible reason that blocks the waitForCleanups
BewareMyPower Sep 26, 2024
5ad8309
Fix regression that does not swallow the exception from filters
BewareMyPower Sep 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,9 @@ public CompletableFuture<Void> closeAsync() {
return closeFuture;
}
LOG.info("Closing PulsarService");
if (topicPoliciesService != null) {
topicPoliciesService.close();
}
if (brokerService != null) {
brokerService.unloadNamespaceBundlesGracefully();
}
Expand Down Expand Up @@ -633,10 +636,6 @@ public CompletableFuture<Void> closeAsync() {
transactionBufferClient.close();
}

if (topicPoliciesService != null) {
topicPoliciesService.close();
topicPoliciesService = null;
}

if (client != null) {
client.close();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,14 @@ public class ExtensibleLoadManagerImpl implements ExtensibleLoadManager, BrokerS

private SplitManager splitManager;

volatile boolean started = false;
enum State {
INIT,
RUNNING,
// It's removing visibility of the current broker from other brokers. In this state, it cannot play as a leader
// or follower.
DISABLED,
}
private final AtomicReference<State> state = new AtomicReference<>(State.INIT);

private boolean configuredSystemTopics = false;

Expand Down Expand Up @@ -214,7 +221,7 @@ public CompletableFuture<Set<NamespaceBundle>> getOwnedServiceUnitsAsync() {
}

public Set<NamespaceBundle> getOwnedServiceUnits() {
if (!started) {
if (state.get() == State.INIT) {
log.warn("Failed to get owned service units, load manager is not started.");
return Collections.emptySet();
}
Expand Down Expand Up @@ -344,7 +351,7 @@ public static CompletableFuture<Optional<BrokerLookupData>> getAssignedBrokerLoo

@Override
public void start() throws PulsarServerException {
if (this.started) {
if (state.get() != State.INIT) {
return;
}
try {
Expand Down Expand Up @@ -443,7 +450,9 @@ public void start() throws PulsarServerException {

this.splitScheduler.start();
this.initWaiter.complete(true);
this.started = true;
if (!state.compareAndSet(State.INIT, State.RUNNING)) {
failForUnexpectedState("start");
}
log.info("Started load manager.");
} catch (Throwable e) {
failStarting(e);
Expand Down Expand Up @@ -615,21 +624,17 @@ public CompletableFuture<Optional<String>> selectAsync(ServiceUnitId bundle,
filter.filterAsync(availableBrokerCandidates, bundle, context);
futures.add(future);
}
CompletableFuture<Optional<String>> result = new CompletableFuture<>();
FutureUtil.waitForAll(futures).whenComplete((__, ex) -> {
if (ex != null) {
// TODO: We may need to revisit this error case.
log.error("Failed to filter out brokers when select bundle: {}", bundle, ex);
}
return FutureUtil.waitForAll(futures).exceptionally(e -> {
// TODO: We may need to revisit this error case.
log.error("Failed to filter out brokers when select bundle: {}", bundle, e);
return null;
}).thenApply(__ -> {
if (availableBrokerCandidates.isEmpty()) {
result.complete(Optional.empty());
return;
return Optional.empty();
}
Set<String> candidateBrokers = availableBrokerCandidates.keySet();

result.complete(getBrokerSelectionStrategy().select(candidateBrokers, bundle, context));
return getBrokerSelectionStrategy().select(candidateBrokers, bundle, context);
});
return result;
});
}

Expand Down Expand Up @@ -667,6 +672,9 @@ public CompletableFuture<Void> unloadNamespaceBundleAsync(ServiceUnitId bundle,
boolean force,
long timeout,
TimeUnit timeoutUnit) {
if (state.get() == State.INIT) {
return CompletableFuture.completedFuture(null);
}
if (NamespaceService.isSLAOrHeartbeatNamespace(bundle.getNamespaceObject().toString())) {
log.info("Skip unloading namespace bundle: {}.", bundle);
return CompletableFuture.completedFuture(null);
Expand Down Expand Up @@ -755,24 +763,11 @@ private CompletableFuture<Void> splitAsync(SplitDecision decision,

@Override
public void close() throws PulsarServerException {
if (!this.started) {
if (state.get() == State.INIT) {
return;
}
try {
if (brokerLoadDataReportTask != null) {
brokerLoadDataReportTask.cancel(true);
}

if (topBundlesLoadDataReportTask != null) {
topBundlesLoadDataReportTask.cancel(true);
}

if (monitorTask != null) {
monitorTask.cancel(true);
}

this.brokerLoadDataStore.shutdown();
this.topBundlesLoadDataStore.shutdown();
stopLoadDataReportTasks();
this.unloadScheduler.close();
this.splitScheduler.close();
this.serviceUnitStateTableViewSyncer.close();
Expand All @@ -791,14 +786,36 @@ public void close() throws PulsarServerException {
} catch (Exception e) {
throw new PulsarServerException(e);
} finally {
this.started = false;
state.set(State.INIT);
}
}

}
}
}

private void stopLoadDataReportTasks() {
if (brokerLoadDataReportTask != null) {
brokerLoadDataReportTask.cancel(true);
}
if (topBundlesLoadDataReportTask != null) {
topBundlesLoadDataReportTask.cancel(true);
}
if (monitorTask != null) {
monitorTask.cancel(true);
}
try {
brokerLoadDataStore.shutdown();
} catch (IOException e) {
log.warn("Failed to shutdown brokerLoadDataStore", e);
}
try {
topBundlesLoadDataStore.shutdown();
} catch (IOException e) {
log.warn("Failed to shutdown topBundlesLoadDataStore", e);
}
}

public static boolean isInternalTopic(String topic) {
return INTERNAL_TOPICS.contains(topic)
|| topic.startsWith(TOPIC)
Expand All @@ -814,13 +831,16 @@ synchronized void playLeader() {
boolean becameFollower = false;
while (!Thread.currentThread().isInterrupted()) {
try {
if (!initWaiter.get()) {
if (!initWaiter.get() || disabled()) {
return;
}
if (!serviceUnitStateChannel.isChannelOwner()) {
becameFollower = true;
break;
}
if (disabled()) {
return;
}
// Confirm the system topics have been created or create them if they do not exist.
// If the leader has changed, the new leader need to reset
// the local brokerService.topics (by this topic creations).
Expand All @@ -835,6 +855,11 @@ synchronized void playLeader() {
}
break;
} catch (Throwable e) {
if (disabled()) {
log.warn("The broker:{} failed to set the role but exit because it's disabled",
pulsar.getBrokerId(), e);
return;
}
log.warn("The broker:{} failed to set the role. Retrying {} th ...",
pulsar.getBrokerId(), ++retry, e);
try {
Expand All @@ -846,6 +871,9 @@ synchronized void playLeader() {
}
}
}
if (disabled()) {
return;
}

if (becameFollower) {
log.warn("The broker:{} became follower while initializing leader role.", pulsar.getBrokerId());
Expand All @@ -869,13 +897,16 @@ synchronized void playFollower() {
boolean becameLeader = false;
while (!Thread.currentThread().isInterrupted()) {
try {
if (!initWaiter.get()) {
if (!initWaiter.get() || disabled()) {
return;
}
if (serviceUnitStateChannel.isChannelOwner()) {
becameLeader = true;
break;
}
if (disabled()) {
return;
}
unloadScheduler.close();
serviceUnitStateChannel.cancelOwnershipMonitor();
closeInternalTopics();
Expand All @@ -885,6 +916,11 @@ synchronized void playFollower() {
serviceUnitStateTableViewSyncer.close();
break;
} catch (Throwable e) {
if (disabled()) {
log.warn("The broker:{} failed to set the role but exit because it's disabled",
pulsar.getBrokerId(), e);
return;
}
log.warn("The broker:{} failed to set the role. Retrying {} th ...",
pulsar.getBrokerId(), ++retry, e);
try {
Expand All @@ -896,6 +932,9 @@ synchronized void playFollower() {
}
}
}
if (disabled()) {
return;
}

if (becameLeader) {
log.warn("This broker:{} became leader while initializing follower role.", pulsar.getBrokerId());
Expand Down Expand Up @@ -982,9 +1021,20 @@ protected void monitor() {
}

public void disableBroker() throws Exception {
// TopicDoesNotExistException might be thrown and it's not recoverable. Enable this flag to exit playFollower()
// or playLeader() quickly.
if (!state.compareAndSet(State.RUNNING, State.DISABLED)) {
failForUnexpectedState("disableBroker");
}
stopLoadDataReportTasks();
serviceUnitStateChannel.cleanOwnerships();
leaderElectionService.close();
brokerRegistry.unregister();
leaderElectionService.close();
final var availableBrokers = brokerRegistry.getAvailableBrokersAsync()
.get(conf.getMetadataStoreOperationTimeoutSeconds(), TimeUnit.SECONDS);
if (availableBrokers.isEmpty()) {
close();
}
// Close the internal topics (if owned any) after giving up the possible leader role,
// so that the subsequent lookups could hit the next leader.
closeInternalTopics();
Expand Down Expand Up @@ -1018,4 +1068,16 @@ protected BrokerRegistry createBrokerRegistry(PulsarService pulsar) {
protected ServiceUnitStateChannel createServiceUnitStateChannel(PulsarService pulsar) {
return new ServiceUnitStateChannelImpl(pulsar);
}

private void failForUnexpectedState(String msg) {
throw new IllegalStateException("Failed to " + msg + ", state: " + state.get());
}

boolean running() {
return state.get() == State.RUNNING;
}

private boolean disabled() {
return state.get() == State.DISABLED;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public void start() throws PulsarServerException {
}

public boolean started() {
return loadManager.started && loadManager.getServiceUnitStateChannel().started();
return loadManager.running() && loadManager.getServiceUnitStateChannel().started();
}

@Override
Expand Down
Loading
Loading