Skip to content

Commit

Permalink
dnsdist: Fix exponential backoff computation in edge cases
Browse files Browse the repository at this point in the history
  • Loading branch information
rgacogne committed Feb 27, 2024
1 parent 4d235bf commit 77ef4e0
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
7 changes: 4 additions & 3 deletions pdns/dnsdistdist/dnsdist-backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,7 @@ bool DownstreamState::healthCheckRequired(std::optional<time_t> currentTime)
lastResults.clear();
vinfolog("Backend %s reached the lazy health-check threshold (%f%% out of %f%%, looking at sample of %d items with %d failures), moving to Potential Failure state", getNameWithAddr(), current, maxFailureRate, totalCount, failures);
stats->d_status = LazyHealthCheckStats::LazyStatus::PotentialFailure;
+ consecutiveSuccessfulChecks = 0;
/* we update the next check time here because the check might time out,
and we do not want to send a second check during that time unless
the timer is actually very short */
Expand Down Expand Up @@ -751,7 +752,7 @@ void DownstreamState::updateNextLazyHealthCheck(LazyHealthCheckStats& stats, boo

time_t backOff = d_config.d_lazyHealthCheckMaxBackOff;
const ExponentialBackOffTimer backOffTimer(d_config.d_lazyHealthCheckMaxBackOff);
- auto backOffCoeffTmp = backOffTimer.get(failedTests);
+ auto backOffCoeffTmp = backOffTimer.get(failedTests - 1);
/* backOffCoeffTmp cannot be higher than d_config.d_lazyHealthCheckMaxBackOff */
const auto backOffCoeff = static_cast<time_t>(backOffCoeffTmp);
if ((std::numeric_limits<time_t>::max() / d_config.d_lazyHealthCheckFailedInterval) >= backOffCoeff) {
Expand Down Expand Up @@ -800,12 +801,12 @@ void DownstreamState::submitHealthCheckResult(bool initial, bool newResult)
if (newResult) {
/* check succeeded */
currentCheckFailures = 0;
- consecutiveSuccessfulChecks++;

if (!upStatus) {
/* we were previously marked as "down" and had a successful health-check,
let's see if this is enough to move to the "up" state or if we need
more successful health-checks for that */
+ consecutiveSuccessfulChecks++;
if (consecutiveSuccessfulChecks < d_config.minRiseSuccesses) {
/* we need more than one successful check to rise
and we didn't reach the threshold yet, let's stay down */
Expand Down Expand Up @@ -846,7 +847,7 @@ void DownstreamState::submitHealthCheckResult(bool initial, bool newResult)
auto stats = d_lazyHealthCheckStats.lock();
vinfolog("Backend %s failed its health-check, moving from Potential failure to Failed", getNameWithAddr());
stats->d_status = LazyHealthCheckStats::LazyStatus::Failed;
- currentCheckFailures = 0;
+ currentCheckFailures = 1;
updateNextLazyHealthCheck(*stats, false);
}
}
Expand Down
8 changes: 4 additions & 4 deletions pdns/dnsdistdist/test-dnsdistbackend_cc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,8 @@ BOOST_AUTO_TEST_CASE(test_LazyExponentialBackOff)
BOOST_CHECK_EQUAL(ds.getStatus(), "down");
BOOST_CHECK_EQUAL(ds.healthCheckRequired(currentTime), false);
/* and the wait time between two checks will double every time a failure occurs */
- BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures))));
- BOOST_CHECK_EQUAL(ds.currentCheckFailures, 0U);
+ BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures - 1))));
+ BOOST_CHECK_EQUAL(ds.currentCheckFailures, 1U);

/* so after 5 failures */
const size_t nbFailures = 5;
Expand All @@ -274,8 +274,8 @@ BOOST_AUTO_TEST_CASE(test_LazyExponentialBackOff)
BOOST_CHECK(ds.healthCheckRequired(currentTime));
ds.submitHealthCheckResult(false, false);
}
- BOOST_CHECK_EQUAL(ds.currentCheckFailures, nbFailures);
- BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures))));
+ BOOST_CHECK_EQUAL(ds.currentCheckFailures, nbFailures + 1);
+ BOOST_CHECK_EQUAL(ds.getNextLazyHealthCheck(), (currentTime + (config.d_lazyHealthCheckFailedInterval * std::pow(2U, ds.currentCheckFailures - 1))));

/* we need minRiseSuccesses successful health-checks to go up */
BOOST_REQUIRE(config.minRiseSuccesses >= 1);
Expand Down

0 comments on commit 77ef4e0

Please sign in to comment.