Skip to content

Commit 45de62a

Browse files
poorbarcode authored and srinath-ctds committed
[fix][broker] One topic can be closed multiple times concurrently (apache#17524)
(cherry picked from commit 93afd89) (cherry picked from commit 620fe9b)
1 parent 90036e1 commit 45de62a

File tree

4 files changed

+233
-28
lines changed

4 files changed

+233
-28
lines changed

pulsar-broker/src/main/java/org/apache/pulsar/broker/service/persistent/PersistentTopic.java

Lines changed: 104 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import java.util.concurrent.TimeUnit;
4949
import java.util.concurrent.atomic.AtomicBoolean;
5050
import java.util.concurrent.atomic.AtomicLong;
51+
import java.util.concurrent.atomic.AtomicReference;
5152
import java.util.function.BiFunction;
5253
import java.util.stream.Collectors;
5354
import javax.annotation.Nonnull;
@@ -259,6 +260,37 @@ protected TopicStatsHelper initialValue() {
259260
@Getter
260261
private final ExecutorService orderedExecutor;
261262

263+
private volatile CloseFutures closeFutures;
264+
265+
/***
266+
* We use 2 futures to prevent a new closing if there is an in-progress deletion or closing. We make Pulsar return
267+
* the in-progress one when it is called the second time.
268+
*
269+
* Topic closing is triggered in the following scenarios:
270+
* 1. Calling "pulsar-admin topics unload". Related to {@link CloseFutures#waitDisconnectClients}.
271+
* 2. Namespace bundle unloading. A topic unload triggered by unloading namespace bundles will not wait for
272+
* clients to disconnect. See {@link CloseFutures#notWaitDisconnectClients}.
273+
*
274+
* The two futures are set according to the following rules:
275+
* Event: Topic close.
276+
* - If the first close call is "close and not wait for clients disconnect":
277+
* - {@link CloseFutures#waitDisconnectClients} will be initialized as "waiting for clients disconnect".
278+
* - If the first close call is "close and wait for clients disconnect", the two futures will be
279+
* initialized as "waiting for clients disconnect".
280+
* Event: Topic delete.
281+
* both futures will be initialized as "waiting for clients disconnect".
282+
*/
283+
private class CloseFutures {
284+
private final CompletableFuture<Void> notWaitDisconnectClients;
285+
private final CompletableFuture<Void> waitDisconnectClients;
286+
287+
public CloseFutures(CompletableFuture<Void> waitDisconnectClients,
288+
CompletableFuture<Void> notWaitDisconnectClients) {
289+
this.waitDisconnectClients = waitDisconnectClients;
290+
this.notWaitDisconnectClients = notWaitDisconnectClients;
291+
}
292+
}
293+
262294
private static class TopicStatsHelper {
263295
public double averageMsgSize;
264296
public double aggMsgRateIn;
@@ -1356,8 +1388,10 @@ private CompletableFuture<Void> delete(boolean failIfHasSubscriptions,
13561388
}
13571389

13581390
fenceTopicToCloseOrDelete(); // Avoid clients reconnections while deleting
1391+
// Mark the close as in progress to prevent concurrent close calls.
1392+
this.closeFutures = new CloseFutures(new CompletableFuture(), new CompletableFuture());
13591393

1360-
return getBrokerService().getPulsar().getPulsarResources().getNamespaceResources()
1394+
CompletableFuture<Void> res = getBrokerService().getPulsar().getPulsarResources().getNamespaceResources()
13611395
.getPartitionedTopicResources().runWithMarkDeleteAsync(TopicName.get(topic), () -> {
13621396
CompletableFuture<Void> deleteFuture = new CompletableFuture<>();
13631397

@@ -1460,6 +1494,10 @@ public void deleteLedgerComplete(Object ctx) {
14601494
unfenceTopicToResume();
14611495
}
14621496
});
1497+
1498+
FutureUtil.completeAfter(closeFutures.notWaitDisconnectClients, res);
1499+
FutureUtil.completeAfter(closeFutures.waitDisconnectClients, res);
1500+
return res;
14631501
} finally {
14641502
lock.writeLock().unlock();
14651503
}
@@ -1470,6 +1508,11 @@ public CompletableFuture<Void> close() {
14701508
return close(false);
14711509
}
14721510

1511+
private enum CloseTypes {
1512+
notWaitDisconnectClients,
1513+
waitDisconnectClients;
1514+
}
1515+
14731516
/**
14741517
* Close this topic - close all producers and subscriptions associated with this topic.
14751518
*
@@ -1478,19 +1521,32 @@ public CompletableFuture<Void> close() {
14781521
*/
14791522
@Override
14801523
public CompletableFuture<Void> close(boolean closeWithoutWaitingClientDisconnect) {
1481-
CompletableFuture<Void> closeFuture = new CompletableFuture<>();
14821524

1483-
lock.writeLock().lock();
1484-
try {
1525+
CloseTypes closeType;
1526+
if (closeWithoutWaitingClientDisconnect) {
1527+
closeType = CloseTypes.notWaitDisconnectClients;
1528+
} else {
14851529
// closing managed-ledger waits until all producers/consumers/replicators get closed. Sometimes, broker
14861530
// forcefully wants to close managed-ledger without waiting all resources to be closed.
1487-
if (!isClosingOrDeleting || closeWithoutWaitingClientDisconnect) {
1488-
fenceTopicToCloseOrDelete();
1489-
} else {
1490-
log.warn("[{}] Topic is already being closed or deleted", topic);
1491-
closeFuture.completeExceptionally(new TopicFencedException("Topic is already fenced"));
1492-
return closeFuture;
1531+
closeType = CloseTypes.waitDisconnectClients;
1532+
}
1533+
1534+
lock.writeLock().lock();
1535+
try {
1536+
// Return in-progress future if exists.
1537+
if (isClosingOrDeleting) {
1538+
switch (closeType) {
1539+
case notWaitDisconnectClients -> {
1540+
return closeFutures.notWaitDisconnectClients;
1541+
}
1542+
case waitDisconnectClients -> {
1543+
return closeFutures.waitDisconnectClients;
1544+
}
1545+
}
14931546
}
1547+
// No in-progress closing.
1548+
fenceTopicToCloseOrDelete();
1549+
this.closeFutures = new CloseFutures(new CompletableFuture(), new CompletableFuture());
14941550
} finally {
14951551
lock.writeLock().unlock();
14961552
}
@@ -1528,11 +1584,22 @@ public CompletableFuture<Void> close(boolean closeWithoutWaitingClientDisconnect
15281584
}
15291585
}
15301586

1531-
CompletableFuture<Void> clientCloseFuture = closeWithoutWaitingClientDisconnect
1532-
? CompletableFuture.completedFuture(null)
1533-
: FutureUtil.waitForAll(futures);
1587+
CompletableFuture<Void> disconnectClientsInCurrentCall = null;
1588+
AtomicReference<CompletableFuture<Void>> disconnectClientsToCache = new AtomicReference<>();
1589+
switch (closeType) {
1590+
case notWaitDisconnectClients -> {
1591+
disconnectClientsInCurrentCall = CompletableFuture.completedFuture(null);
1592+
disconnectClientsToCache.set(FutureUtil.waitForAll(futures));
1593+
break;
1594+
}
1595+
case waitDisconnectClients -> {
1596+
disconnectClientsInCurrentCall = FutureUtil.waitForAll(futures);
1597+
disconnectClientsToCache.set(disconnectClientsInCurrentCall);
1598+
}
1599+
}
1600+
CompletableFuture<Void> closeFuture = new CompletableFuture<>();
15341601

1535-
clientCloseFuture.thenRun(() -> {
1602+
Runnable closeLedgerAfterCloseClients = () -> {
15361603
// After having disconnected all producers/consumers, close the managed ledger
15371604
ledger.asyncClose(new CloseCallback() {
15381605
@Override
@@ -1547,13 +1614,32 @@ public void closeFailed(ManagedLedgerException exception, Object ctx) {
15471614
disposeTopic(closeFuture);
15481615
}
15491616
}, null);
1550-
}).exceptionally(exception -> {
1617+
};
1618+
disconnectClientsInCurrentCall.thenRun(closeLedgerAfterCloseClients).exceptionally(exception -> {
15511619
log.error("[{}] Error closing topic", topic, exception);
15521620
unfenceTopicToResume();
15531621
closeFuture.completeExceptionally(exception);
15541622
return null;
15551623
});
15561624

1625+
switch (closeType) {
1626+
case notWaitDisconnectClients -> {
1627+
FutureUtil.completeAfter(closeFutures.notWaitDisconnectClients, closeFuture);
1628+
FutureUtil.completeAfterAll(closeFutures.waitDisconnectClients,
1629+
closeFuture.thenCompose(ignore -> disconnectClientsToCache.get().exceptionally(ex -> {
1630+
// Since the managed ledger has been closed, eat the error of clients disconnection.
1631+
log.error("[{}] Closed managed ledger, but disconnect clients failed,"
1632+
+ " this topic will be marked closed", topic, ex);
1633+
return null;
1634+
})));
1635+
break;
1636+
}
1637+
case waitDisconnectClients -> {
1638+
FutureUtil.completeAfter(closeFutures.notWaitDisconnectClients, closeFuture);
1639+
FutureUtil.completeAfterAll(closeFutures.waitDisconnectClients, closeFuture);
1640+
}
1641+
}
1642+
15571643
return closeFuture;
15581644
}
15591645

@@ -1839,10 +1925,10 @@ protected CompletableFuture<Void> addReplicationCluster(String remoteCluster, Ma
18391925
lock.readLock().lock();
18401926
try {
18411927
if (isClosingOrDeleting) {
1842-
// Whether is "transferring" or not, do not create new replicator.
1928+
// Do not create new replicator.
18431929
log.info("[{}] Skip to create replicator because this topic is closing."
1844-
+ " remote cluster: {}. State of transferring : {}",
1845-
topic, remoteCluster, transferring);
1930+
+ " remote cluster: {}.",
1931+
topic, remoteCluster);
18461932
return;
18471933
}
18481934
Replicator replicator = replicators.computeIfAbsent(remoteCluster, r -> {

pulsar-broker/src/test/java/org/apache/pulsar/broker/service/OneWayReplicatorTest.java

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import static org.testng.Assert.assertEquals;
2626
import static org.testng.Assert.assertFalse;
2727
import static org.testng.Assert.assertNotEquals;
28+
import static org.testng.Assert.assertNull;
2829
import static org.testng.Assert.assertTrue;
2930
import static org.testng.Assert.fail;
3031
import com.google.common.collect.Sets;
@@ -232,7 +233,7 @@ public void testTopicCloseWhenInternalProducerCloseErrorOnce() throws Exception
232233
});
233234
}
234235

235-
private void injectMockReplicatorProducerBuilder(
236+
private Runnable injectMockReplicatorProducerBuilder(
236237
BiFunction<ProducerConfigurationData, ProducerImpl, ProducerImpl> producerDecorator)
237238
throws Exception {
238239
String cluster2 = pulsar2.getConfig().getClusterName();
@@ -252,7 +253,8 @@ private void injectMockReplicatorProducerBuilder(
252253
replicationClients = WhiteboxImpl.getInternalState(brokerService, "replicationClients");
253254
PulsarClientImpl internalClient = (PulsarClientImpl) replicationClients.get(cluster2);
254255
PulsarClient spyClient = spy(internalClient);
255-
replicationClients.put(cluster2, spyClient);
256+
assertTrue(replicationClients.remove(cluster2, internalClient));
257+
assertNull(replicationClients.putIfAbsent(cluster2, spyClient));
256258

257259
// Inject producer decorator.
258260
doAnswer(invocation -> {
@@ -281,6 +283,12 @@ private void injectMockReplicatorProducerBuilder(
281283
}).when(spyProducerBuilder).createAsync();
282284
return spyProducerBuilder;
283285
}).when(spyClient).newProducer(any(Schema.class));
286+
287+
// Return a task that cleans up the injection.
288+
return () -> {
289+
assertTrue(replicationClients.remove(cluster2, spyClient));
290+
assertNull(replicationClients.putIfAbsent(cluster2, internalClient));
291+
};
284292
}
285293

286294
private SpyCursor spyCursor(PersistentTopic persistentTopic, String cursorName) throws Exception {
@@ -374,7 +382,7 @@ public void testConcurrencyOfUnloadBundleAndRecreateProducer() throws Exception
374382
// If the retry counter is larger than 6, the next creation will be slow enough to close Replicator.
375383
final AtomicInteger createProducerCounter = new AtomicInteger();
376384
final int failTimes = 6;
377-
injectMockReplicatorProducerBuilder((producerCnf, originalProducer) -> {
385+
Runnable taskToClearInjection = injectMockReplicatorProducerBuilder((producerCnf, originalProducer) -> {
378386
if (topicName.equals(producerCnf.getTopicName())) {
379387
// There is a switch to determine create producer successfully or not.
380388
if (createProducerCounter.incrementAndGet() > failTimes) {
@@ -433,6 +441,7 @@ public void testConcurrencyOfUnloadBundleAndRecreateProducer() throws Exception
433441
});
434442

435443
// cleanup.
444+
taskToClearInjection.run();
436445
cleanupTopics(() -> {
437446
admin1.topics().delete(topicName);
438447
admin2.topics().delete(topicName);
@@ -537,7 +546,7 @@ public void testConcurrencyOfUnloadBundleAndRecreateProducer2() throws Exception
537546
// If the retry counter is larger than 6, the next creation will be slow enough to close Replicator.
538547
final AtomicInteger createProducerCounter = new AtomicInteger();
539548
final int failTimes = 6;
540-
injectMockReplicatorProducerBuilder((producerCnf, originalProducer) -> {
549+
Runnable taskToClearInjection = injectMockReplicatorProducerBuilder((producerCnf, originalProducer) -> {
541550
if (topicName.equals(producerCnf.getTopicName())) {
542551
// There is a switch to determine create producer successfully or not.
543552
if (createProducerCounter.incrementAndGet() > failTimes) {
@@ -599,6 +608,7 @@ public void testConcurrencyOfUnloadBundleAndRecreateProducer2() throws Exception
599608
});
600609

601610
// cleanup.
611+
taskToClearInjection.run();
602612
cleanupTopics(namespaceName, () -> {
603613
admin1.topics().delete(topicName);
604614
admin2.topics().delete(topicName);
@@ -619,8 +629,6 @@ public void testUnFenceTopicToReuse() throws Exception {
619629
final String mockProducerName = UUID.randomUUID().toString();
620630
final org.apache.pulsar.broker.service.Producer mockProducer =
621631
mock(org.apache.pulsar.broker.service.Producer.class);
622-
doAnswer(invocation -> CompletableFuture.failedFuture(new RuntimeException("mocked error")))
623-
.when(mockProducer).disconnect(any());
624632
doAnswer(invocation -> CompletableFuture.failedFuture(new RuntimeException("mocked error")))
625633
.when(mockProducer).disconnect();
626634
PersistentTopic persistentTopic =
@@ -631,7 +639,7 @@ public void testUnFenceTopicToReuse() throws Exception {
631639
GeoPersistentReplicator replicator1 =
632640
(GeoPersistentReplicator) persistentTopic.getReplicators().values().iterator().next();
633641
try {
634-
persistentTopic.close(true, false).join();
642+
persistentTopic.close(false).join();
635643
fail("Expected close fails due to a producer close fails");
636644
} catch (Exception ex) {
637645
log.info("Expected error: {}", ex.getMessage());
@@ -650,8 +658,9 @@ public void testUnFenceTopicToReuse() throws Exception {
650658
assertTrue(replicator2.producer != null && replicator2.producer.isConnected());
651659
});
652660

653-
// cleanup.
661+
// cleanup the injection.
654662
persistentTopic.getProducers().remove(mockProducerName, mockProducer);
663+
// cleanup.
655664
producer1.close();
656665
cleanupTopics(() -> {
657666
admin1.topics().delete(topicName);

0 commit comments

Comments
 (0)