Skip to content

Commit cbf5ac0

Browse files
committed
[fix][broker] Avoid being stuck for 30+ seconds when closing the BrokerService
Fixes apache#22569

### Motivation

`BrokerService#closeAsync` calls `unloadNamespaceBundlesGracefully` to unload namespaces gracefully. With the extensible load manager, it eventually calls `TableViewLoadDataStoreImpl#validateProducer`:

```
BrokerService#unloadNamespaceBundlesGracefully
ExtensibleLoadManagerWrapper#disableBroker
ExtensibleLoadManagerImpl#disableBroker
ServiceUnitStateChannelImpl#cleanOwnerships
ServiceUnitStateChannelImpl#doCleanup
TableViewLoadDataStoreImpl#removeAsync
TableViewLoadDataStoreImpl#validateProducer
```

In `validateProducer`, if the producer is not connected, it will recreate the producer synchronously. However, since the state of `PulsarService` has already been changed to `Closing`, all connect or lookup requests will fail with `ServiceNotReady`. The client will then keep retrying until it times out.

Besides, the unload operation could also trigger the reconnection because the extensible load manager sends the unload event to the `loadbalancer-service-unit-state` topic.

### Modifications

The major fix: before changing `PulsarService`'s state to `Closing`, call `BrokerService#unloadNamespaceBundlesGracefully` so that the load manager completes the unload operations first.

Minor fixes:
- Record and log how long `LoadManager#disableBroker` takes.
- Don't check whether the producer is disconnected, because the producer could retry if it's disconnected.

### Verifications

Add `ExtensibleLoadManagerCloseTest` to verify that closing `PulsarService` won't take too much time.
Here are some local test results:

```
2024-04-24T19:43:38,851 - INFO - [main:ExtensibleLoadManagerCloseTest] - Brokers close time: [3342, 3276, 3310]
2024-04-24T19:44:26,711 - INFO - [main:ExtensibleLoadManagerCloseTest] - Brokers close time: [3357, 3258, 3298]
2024-04-24T19:46:16,791 - INFO - [main:ExtensibleLoadManagerCloseTest] - Brokers close time: [3313, 3257, 3263]
2024-04-24T20:13:05,763 - INFO - [main:ExtensibleLoadManagerCloseTest] - Brokers close time: [3304, 3279, 3299]
2024-04-24T20:13:43,979 - INFO - [main:ExtensibleLoadManagerCloseTest] - Brokers close time: [3343, 3308, 3310]
```

As you can see, each broker takes only about 3 seconds to close, due to the `OWNERSHIP_CLEAN_UP_CONVERGENCE_DELAY_IN_MILLIS` value added in apache#20315.
1 parent 89b201e commit cbf5ac0

File tree

4 files changed

+120
-5
lines changed

4 files changed

+120
-5
lines changed

pulsar-broker/src/main/java/org/apache/pulsar/broker/PulsarService.java

+1
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,7 @@ public CompletableFuture<Void> closeAsync() {
444444
return closeFuture;
445445
}
446446
LOG.info("Closing PulsarService");
447+
brokerService.unloadNamespaceBundlesGracefully();
447448
state = State.Closing;
448449

449450
// close the service in reverse order v.s. in which they are started

pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/extensions/store/TableViewLoadDataStoreImpl.java

+1-5
Original file line numberDiff line numberDiff line change
@@ -161,12 +161,8 @@ public synchronized void init() throws IOException {
161161
}
162162

163163
private void validateProducer() {
164-
if (producer == null || !producer.isConnected()) {
164+
if (producer == null) {
165165
try {
166-
if (producer != null) {
167-
producer.close();
168-
}
169-
producer = null;
170166
startProducer();
171167
log.info("Restarted producer on {}", topic);
172168
} catch (Exception e) {

pulsar-broker/src/main/java/org/apache/pulsar/broker/service/BrokerService.java

+11
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,7 @@ public class BrokerService implements Closeable {
309309
private Set<ManagedLedgerPayloadProcessor> brokerEntryPayloadProcessors;
310310

311311
private final TopicEventsDispatcher topicEventsDispatcher = new TopicEventsDispatcher();
312+
private volatile boolean unloaded = false;
312313

313314
public BrokerService(PulsarService pulsar, EventLoopGroup eventLoopGroup) throws Exception {
314315
this.pulsar = pulsar;
@@ -926,9 +927,13 @@ public void unloadNamespaceBundlesGracefully() {
926927
}
927928

928929
public void unloadNamespaceBundlesGracefully(int maxConcurrentUnload, boolean closeWithoutWaitingClientDisconnect) {
930+
if (unloaded) {
931+
return;
932+
}
929933
try {
930934
log.info("Unloading namespace-bundles...");
931935
// make broker-node unavailable from the cluster
936+
long disableBrokerStartTime = System.nanoTime();
932937
if (pulsar.getLoadManager() != null && pulsar.getLoadManager().get() != null) {
933938
try {
934939
pulsar.getLoadManager().get().disableBroker();
@@ -937,6 +942,10 @@ public void unloadNamespaceBundlesGracefully(int maxConcurrentUnload, boolean cl
937942
// still continue and release bundle ownership as broker's registration node doesn't exist.
938943
}
939944
}
945+
double disableBrokerTimeSeconds =
946+
TimeUnit.NANOSECONDS.toMillis((System.nanoTime() - disableBrokerStartTime))
947+
/ 1000.0;
948+
log.info("Disable broker in load manager completed in {} seconds", disableBrokerTimeSeconds);
940949

941950
// unload all namespace-bundles gracefully
942951
long closeTopicsStartTime = System.nanoTime();
@@ -966,6 +975,8 @@ public void unloadNamespaceBundlesGracefully(int maxConcurrentUnload, boolean cl
966975
}
967976
} catch (Exception e) {
968977
log.error("Failed to disable broker from loadbalancer list {}", e.getMessage(), e);
978+
} finally {
979+
unloaded = true;
969980
}
970981
}
971982

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.pulsar.broker.loadbalance.extensions;
20+
21+
import java.util.ArrayList;
22+
import java.util.Collections;
23+
import java.util.List;
24+
import java.util.Optional;
25+
import lombok.extern.slf4j.Slf4j;
26+
import org.apache.pulsar.broker.PulsarService;
27+
import org.apache.pulsar.broker.ServiceConfiguration;
28+
import org.apache.pulsar.client.admin.PulsarAdmin;
29+
import org.apache.pulsar.client.api.PulsarClient;
30+
import org.apache.pulsar.common.policies.data.ClusterData;
31+
import org.apache.pulsar.common.policies.data.TenantInfo;
32+
import org.apache.pulsar.zookeeper.LocalBookkeeperEnsemble;
33+
import org.testng.Assert;
34+
import org.testng.annotations.AfterClass;
35+
import org.testng.annotations.BeforeClass;
36+
import org.testng.annotations.Test;
37+
38+
@Slf4j
39+
public class ExtensibleLoadManagerCloseTest {
40+
41+
private static final String clusterName = "test";
42+
private final LocalBookkeeperEnsemble bk = new LocalBookkeeperEnsemble(1, 0, () -> 0);
43+
private final List<PulsarService> brokers = new ArrayList<>();
44+
private PulsarAdmin admin;
45+
46+
@BeforeClass(alwaysRun = true)
47+
public void setup() throws Exception {
48+
bk.start();
49+
for (int i = 0; i < 3; i++) {
50+
final var broker = new PulsarService(brokerConfig());
51+
broker.start();
52+
brokers.add(broker);
53+
}
54+
admin = brokers.get(0).getAdminClient();
55+
admin.clusters().createCluster(clusterName, ClusterData.builder().build());
56+
admin.tenants().createTenant("public", TenantInfo.builder()
57+
.allowedClusters(Collections.singleton(clusterName)).build());
58+
admin.namespaces().createNamespace("public/default");
59+
}
60+
61+
62+
@AfterClass(alwaysRun = true, timeOut = 30000)
63+
public void cleanup() throws Exception {
64+
bk.stop();
65+
}
66+
67+
private ServiceConfiguration brokerConfig() {
68+
final var config = new ServiceConfiguration();
69+
config.setClusterName(clusterName);
70+
config.setAdvertisedAddress("localhost");
71+
config.setBrokerServicePort(Optional.of(0));
72+
config.setWebServicePort(Optional.of(0));
73+
config.setMetadataStoreUrl("zk:127.0.0.1:" + bk.getZookeeperPort());
74+
config.setManagedLedgerDefaultWriteQuorum(1);
75+
config.setManagedLedgerDefaultAckQuorum(1);
76+
config.setManagedLedgerDefaultEnsembleSize(1);
77+
config.setDefaultNumberOfNamespaceBundles(16);
78+
config.setLoadBalancerAutoBundleSplitEnabled(false);
79+
config.setLoadManagerClassName(ExtensibleLoadManagerImpl.class.getName());
80+
config.setLoadBalancerDebugModeEnabled(true);
81+
config.setBrokerShutdownTimeoutMs(100);
82+
return config;
83+
}
84+
85+
86+
@Test
87+
public void testCloseAfterLoadingBundles() throws Exception {
88+
final var topic = "test";
89+
admin.topics().createPartitionedTopic(topic, 20);
90+
admin.lookups().lookupPartitionedTopic(topic);
91+
final var client = PulsarClient.builder().serviceUrl(brokers.get(0).getBrokerServiceUrl()).build();
92+
final var producer = client.newProducer().topic(topic).create();
93+
producer.close();
94+
client.close();
95+
96+
final var closeTimeMsList = new ArrayList<Long>();
97+
for (var broker : brokers) {
98+
final var startTimeMs = System.currentTimeMillis();
99+
broker.close();
100+
closeTimeMsList.add(System.currentTimeMillis() - startTimeMs);
101+
}
102+
log.info("Brokers close time: {}", closeTimeMsList);
103+
for (var closeTimeMs : closeTimeMsList) {
104+
Assert.assertTrue(closeTimeMs < 5000L);
105+
}
106+
}
107+
}

0 commit comments

Comments
 (0)