Skip to content

Commit 41872e0

Browse files
committed
[ISSUE #144] node abnormal status detection and recovery
Signed-off-by: zhangyang21 <zhangyang21@xiaomi.com>
1 parent eb1b2e2 commit 41872e0

File tree

3 files changed

+57
-2
lines changed

3 files changed

+57
-2
lines changed

src/main/java/io/openmessaging/storage/dledger/DLedgerLeaderElector.java

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import java.util.ArrayList;
3030
import java.util.Arrays;
3131
import java.util.List;
32+
import java.util.Map;
3233
import java.util.Random;
3334
import java.util.concurrent.CompletableFuture;
3435
import java.util.concurrent.CountDownLatch;
@@ -111,6 +112,13 @@ public CompletableFuture<HeartBeatResponse> handleHeartBeat(HeartBeatRequest req
111112
return CompletableFuture.completedFuture(new HeartBeatResponse().term(memberState.currTerm()).code(DLedgerResponseCode.UNEXPECTED_MEMBER.getCode()));
112113
}
113114

115+
if (memberState.isCandidate() && request.isNeedCheckMemberState()) {
116+
logger.warn("[CHECK_MEMBER_STATE] [HandleHeartBeat] remoteId={} need check member state", request.getLeaderId());
117+
if (request.getTerm() < memberState.currTerm()) {
118+
memberState.recoveryToFollower(request.getTerm(), request.getLeaderId());
119+
}
120+
}
121+
114122
if (request.getTerm() < memberState.currTerm()) {
115123
return CompletableFuture.completedFuture(new HeartBeatResponse().term(memberState.currTerm()).code(DLedgerResponseCode.EXPIRED_TERM.getCode()));
116124
} else if (request.getTerm() == memberState.currTerm()) {
@@ -283,10 +291,12 @@ private void sendHeartbeats(long term, String leaderId) throws Exception {
283291
break;
284292
}
285293

286-
if (x.getCode() == DLedgerResponseCode.NETWORK_ERROR.getCode())
294+
if (x.getCode() == DLedgerResponseCode.NETWORK_ERROR.getCode()) {
287295
memberState.getPeersLiveTable().put(id, Boolean.FALSE);
288-
else
296+
} else {
289297
memberState.getPeersLiveTable().put(id, Boolean.TRUE);
298+
memberState.getPeersTermTable().put(id, x.getTerm());
299+
}
290300

291301
if (memberState.isQuorum(succNum.get())
292302
|| memberState.isQuorum(succNum.get() + notReadyNum.get())) {
@@ -305,6 +315,7 @@ private void sendHeartbeats(long term, String leaderId) throws Exception {
305315
beatLatch.await(heartBeatTimeIntervalMs, TimeUnit.MILLISECONDS);
306316
if (memberState.isQuorum(succNum.get())) {
307317
lastSuccHeartBeatTime = System.currentTimeMillis();
318+
checkPeersTermTable();
308319
} else {
309320
logger.info("[{}] Parse heartbeat responses in cost={} term={} allNum={} succNum={} notReadyNum={} inconsistLeader={} maxTerm={} peerSize={} lastSuccHeartBeatTime={}",
310321
memberState.getSelfId(), DLedgerUtils.elapsed(startHeartbeatTimeMs), term, allNum.get(), succNum.get(), notReadyNum.get(), inconsistLeader.get(), maxTerm.get(), memberState.peerSize(), new Timestamp(lastSuccHeartBeatTime));
@@ -320,6 +331,28 @@ private void sendHeartbeats(long term, String leaderId) throws Exception {
320331
}
321332
}
322333

334+
/**
 * Walks the peer term table and, for every peer entry whose recorded term is greater than
 * the term recorded for the leader, sends one additional heartbeat request with
 * {@code needCheckMemberState} set to {@code true}.
 *
 * <p>The method is a no-op unless this node's id equals the current leader id. It is invoked
 * from {@code sendHeartbeats} after a quorum of heartbeat responses succeeded.
 *
 * @throws Exception declared by the signature; presumably raised by the RPC call — the
 *                   service implementation is not visible here
 */
private void checkPeersTermTable() throws Exception {
335+
// Only the node that currently considers itself leader performs the check.
if (memberState.getSelfId().equals(memberState.getLeaderId())) {
336+
// Falls back to -1 when the table has no entry stored under the leader id.
// NOTE(review): the visible sendHeartbeats hunk stores terms under the `id` of each
// heartbeat response; confirm the leader's own term is ever recorded in this table,
// otherwise every peer term compares greater than -1.
long leaderTerm = memberState.getPeersTermTable().getOrDefault(memberState.getLeaderId(), -1L);
337+
for (Map.Entry<String, Long> entryTerm : memberState.getPeersTermTable().entrySet()) {
338+
// Skip this node's own entry, if one exists.
if (entryTerm.getKey().equals(memberState.getSelfId())) {
339+
continue;
340+
}
341+
342+
// A peer reported a term higher than the leader's recorded term.
if (entryTerm.getValue() > leaderTerm) {
343+
HeartBeatRequest heartBeatRequest = new HeartBeatRequest();
344+
heartBeatRequest.setGroup(memberState.getGroup());
345+
heartBeatRequest.setLocalId(memberState.getSelfId());
346+
// NOTE(review): the remote id is this node's own id, not entryTerm.getKey();
// confirm the request is actually meant to reach the peer with the higher term.
heartBeatRequest.setRemoteId(memberState.getSelfId());
347+
heartBeatRequest.setLeaderId(memberState.getLeaderId());
348+
// Flag read by handleHeartBeat on the receiving side when that node is a candidate.
heartBeatRequest.setNeedCheckMemberState(true);
349+
heartBeatRequest.setTerm(leaderTerm);
350+
// The result of the call is not inspected.
dLedgerRpcService.heartBeat(heartBeatRequest);
351+
}
352+
}
353+
}
354+
}
355+
323356
private void maintainAsLeader() throws Exception {
324357
if (DLedgerUtils.elapsed(lastSendHeartBeatTime) > heartBeatTimeIntervalMs) {
325358
long term;

src/main/java/io/openmessaging/storage/dledger/MemberState.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public class MemberState {
5252
private long knownMaxTermInGroup = -1;
5353
private Map<String, String> peerMap = new HashMap<>();
5454
private Map<String, Boolean> peersLiveTable = new ConcurrentHashMap<>();
55+
private Map<String, Long> peersTermTable = new HashMap<>();
5556

5657
private volatile String transferee;
5758
private volatile long termToTakeLeadership = -1;
@@ -132,6 +133,7 @@ public synchronized void changeToLeader(long term) {
132133
this.role = LEADER;
133134
this.leaderId = selfId;
134135
peersLiveTable.clear();
136+
peersTermTable.clear();
135137
}
136138

137139
public synchronized void changeToFollower(long term, String leaderId) {
@@ -153,6 +155,13 @@ public synchronized void changeToCandidate(long term) {
153155
transferee = null;
154156
}
155157

158+
/**
 * Switches this member to the FOLLOWER role under the given leader and overwrites the
 * current term with {@code term}, clearing any pending transferee.
 *
 * <p>The visible caller in {@code handleHeartBeat} invokes this only when the request term is
 * lower than {@code currTerm}, so the assignment below lowers the local term.
 * NOTE(review): no persistence or validation of the new term is performed here; confirm
 * that is intended.
 *
 * @param term     term to adopt as the current term
 * @param leaderId id of the node to record as leader
 */
public synchronized void recoveryToFollower(long term, String leaderId) {
159+
this.role = FOLLOWER;
160+
this.leaderId = leaderId;
161+
// Unconditional overwrite: the term is not checked against the existing currTerm.
this.currTerm = term;
162+
transferee = null;
163+
}
164+
156165
public String getTransferee() {
157166
return transferee;
158167
}
@@ -226,6 +235,10 @@ public Map<String, Boolean> getPeersLiveTable() {
226235
return peersLiveTable;
227236
}
228237

238+
/**
 * Returns the map of peer id to the term recorded for that peer.
 *
 * <p>The internal map instance is returned directly, not a copy, so callers mutate shared
 * state. NOTE(review): the backing field is a plain {@code HashMap} while the neighbouring
 * {@code peersLiveTable} is a {@code ConcurrentHashMap}; confirm which threads read and
 * write this table.
 *
 * @return the live peer term table
 */
public Map<String, Long> getPeersTermTable() {
239+
return peersTermTable;
240+
}
241+
229242
//just for test
230243
public void setCurrTermForTest(long term) {
231244
PreConditions.check(term >= currTerm, DLedgerResponseCode.ILLEGAL_MEMBER_STATE);

src/main/java/io/openmessaging/storage/dledger/protocol/HeartBeatRequest.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,13 @@
1818

1919
/**
 * Heartbeat request message. In addition to whatever {@code RequestOrResponse} carries, it
 * holds a flag asking the receiver to re-check its member state.
 */
public class HeartBeatRequest extends RequestOrResponse {
2020

21+
// When true, a receiver that is currently a candidate runs the member-state check in
// handleHeartBeat. Defaults to false.
private boolean needCheckMemberState = false;
22+
23+
/**
 * @return whether the receiver is asked to check its member state
 */
public boolean isNeedCheckMemberState() {
24+
return needCheckMemberState;
25+
}
26+
27+
/**
 * @param needCheckMemberState {@code true} to ask the receiver to check its member state
 */
public void setNeedCheckMemberState(boolean needCheckMemberState) {
28+
this.needCheckMemberState = needCheckMemberState;
29+
}
2130
}

0 commit comments

Comments
 (0)