From b830982b908249d0180e6aa3810742766b4974b7 Mon Sep 17 00:00:00 2001 From: heyuchen Date: Fri, 28 Dec 2018 09:00:59 +0800 Subject: [PATCH 1/3] fd: fix failure detection bug --- src/dist/failure_detector/failure_detector.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/dist/failure_detector/failure_detector.cpp b/src/dist/failure_detector/failure_detector.cpp index 6942d52652..1b5e38c969 100644 --- a/src/dist/failure_detector/failure_detector.cpp +++ b/src/dist/failure_detector/failure_detector.cpp @@ -397,6 +397,18 @@ bool failure_detector::end_ping_internal(::dsn::error_code err, const beacon_ack return true; } + // TODO(hyc): try to fix 189 + // when ack is not master, it should not modify last_send_time_for_beacon_with_ack, otherwise + // worker may not suside before master consider it disconnected, which may cause two primary replica bug + // but 10s may be not enough to switch meta and connect zk, so that may add unavailable time during rolling-update + // so we should length lease time to avoid possible no-meta situation + if (!ack.is_master) { + dwarn("node[%s] is not master, ack.primary_node[%s] is real master", + node.to_string(), + ack.primary_node.to_string()); + return true; + } + // update last_send_time_for_beacon_with_ack record.last_send_time_for_beacon_with_ack = beacon_send_time; record.rejected = false; From 229551f50823cccb8a955b74d6414d99004c59c1 Mon Sep 17 00:00:00 2001 From: heyuchen Date: Mon, 7 Jan 2019 10:55:41 +0800 Subject: [PATCH 2/3] fd: fix comments --- src/dist/failure_detector/failure_detector.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/dist/failure_detector/failure_detector.cpp b/src/dist/failure_detector/failure_detector.cpp index 1b5e38c969..0d9ed8579c 100644 --- a/src/dist/failure_detector/failure_detector.cpp +++ b/src/dist/failure_detector/failure_detector.cpp @@ -397,11 +397,10 @@ bool failure_detector::end_ping_internal(::dsn::error_code err, const beacon_ack return true; } - // TODO(hyc): try to fix 189 // when ack is not master, it should not modify last_send_time_for_beacon_with_ack, otherwise - // worker may not suside before master consider it disconnected, which may cause two primary replica bug - // but 10s may be not enough to switch meta and connect zk, so that may add unavailable time during rolling-update - // so we should length lease time to avoid possible no-meta situation + // worker may not suside before master consider it disconnected, which may cause two primary + // replica, but 10s may be not enough to switch meta and connect zk, so we should length lease + // time to avoid possible no-meta situation if (!ack.is_master) { dwarn("node[%s] is not master, ack.primary_node[%s] is real master", node.to_string(), From 0588800ba8f441444149dca312ec1ab40eee0efb Mon Sep 17 00:00:00 2001 From: heyuchen Date: Mon, 7 Jan 2019 14:29:43 +0800 Subject: [PATCH 3/3] fd: fix comments --- src/dist/failure_detector/failure_detector.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/dist/failure_detector/failure_detector.cpp b/src/dist/failure_detector/failure_detector.cpp index 0d9ed8579c..a7514723b2 100644 --- a/src/dist/failure_detector/failure_detector.cpp +++ b/src/dist/failure_detector/failure_detector.cpp @@ -397,10 +397,7 @@ bool failure_detector::end_ping_internal(::dsn::error_code err, const beacon_ack return true; } - // when ack is not master, it should not modify last_send_time_for_beacon_with_ack, otherwise - // worker may not suside before master consider it disconnected, which may cause two primary - // replica, but 10s may be not enough to switch meta and connect zk, so we should length lease - // time to avoid possible no-meta situation + // if ack is not from master meta, worker should not update its last send time if (!ack.is_master) { dwarn("node[%s] is not master, ack.primary_node[%s] is real master", node.to_string(),