From 19939eb7d4a77ecda14c70c9cec1e7d5a7137276 Mon Sep 17 00:00:00 2001 From: John Rushford Date: Thu, 8 Jul 2021 20:34:34 +0000 Subject: [PATCH] Fixes an issue in ParentSelection where a down parent may not be retried due to retrier's limiting. --- proxy/ParentSelection.h | 27 ++++++++++++++++++++ proxy/ParentSelectionStrategy.cc | 5 ---- proxy/http/HttpTransact.cc | 17 ++++++++++++ proxy/http/remap/NextHopConsistentHash.cc | 1 + proxy/http/remap/NextHopHealthStatus.cc | 26 +++++++++++++++++++ proxy/http/remap/NextHopSelectionStrategy.cc | 6 +++++ proxy/http/remap/NextHopSelectionStrategy.h | 8 +++--- 7 files changed, 82 insertions(+), 8 deletions(-) diff --git a/proxy/ParentSelection.h b/proxy/ParentSelection.h index 82c5b2ea33d..b46113089e8 100644 --- a/proxy/ParentSelection.h +++ b/proxy/ParentSelection.h @@ -120,6 +120,14 @@ struct pRecord : ATSConsistentHashNode { float weight; char hash_string[MAXDNAME + 1]; std::atomic retriers = 0; + + void + retryComplete() + { + if (retriers.fetch_sub(1, std::memory_order_relaxed) < 0) { + retriers = 0; + } + } }; typedef ControlMatcher P_table; @@ -359,6 +367,16 @@ class ParentSelectionStrategy // virtual destructor. virtual ~ParentSelectionStrategy(){}; + + void + retryComplete(ParentResult *result) + { + pRecord *p = getParents(result); + uint32_t n = numParents(result); + if (p != nullptr && result->last_parent < n) { + p[result->last_parent].retryComplete(); + } + } }; class ParentConfigParams : public ConfigInfo @@ -411,6 +429,15 @@ class ParentConfigParams : public ConfigInfo } } + void + retryComplete(ParentResult *result) + { + if (!result->is_api_result()) { + ink_release_assert(result != nullptr); + result->rec->selection_strategy->retryComplete(result); + } + } + P_table *parent_table; ParentRecord *DefaultParent; ParentSelectionPolicy policy; diff --git a/proxy/ParentSelectionStrategy.cc b/proxy/ParentSelectionStrategy.cc index 1c1c3a7e4e7..bc962b89436 100644 --- a/proxy/ParentSelectionStrategy.cc +++ b/proxy/ParentSelectionStrategy.cc @@ -65,11 +65,6 @@ ParentSelectionStrategy::markParentDown(ParentResult *result, unsigned int fail_ // must set the count to reflect this if (result->retry == false) { new_fail_count = pRec->failCount = 1; - } else { - // this was a retry that failed, decrement the retriers count - if ((pRec->retriers--) < 0) { - pRec->retriers = 0; - } } Note("Parent %s marked as down %s:%d", (result->retry) ? "retry" : "initially", pRec->hostname, pRec->port); diff --git a/proxy/http/HttpTransact.cc b/proxy/http/HttpTransact.cc index 6e66aa4589c..1dea8c29034 100644 --- a/proxy/http/HttpTransact.cc +++ b/proxy/http/HttpTransact.cc @@ -281,6 +281,17 @@ nextParent(HttpTransact::State *s) } } +inline static void +retryComplete(HttpTransact::State *s) +{ + url_mapping *mp = s->url_map.getMapping(); + if (mp && mp->strategy) { + mp->strategy->retryComplete(reinterpret_cast(s->state_machine), s->parent_result.hostname, s->parent_result.port); + } else if (s->parent_params) { + s->parent_params->retryComplete(&s->parent_result); + } +} + inline static bool is_localhost(const char *name, int len) { @@ -3634,6 +3645,12 @@ HttpTransact::handle_response_from_parent(State *s) TxnDebug("http_trans", "[handle_response_from_parent] (hrfp)"); HTTP_RELEASE_ASSERT(s->current.server == &s->parent_info); + // if this parent was retried from a markdown, then + // notify that the retry has completed. + if (s->parent_result.retry) { + retryComplete(s); + } + simple_or_unavailable_server_retry(s); s->parent_info.state = s->current.state; diff --git a/proxy/http/remap/NextHopConsistentHash.cc b/proxy/http/remap/NextHopConsistentHash.cc index a8943620c84..728c1657fa6 100644 --- a/proxy/http/remap/NextHopConsistentHash.cc +++ b/proxy/http/remap/NextHopConsistentHash.cc @@ -326,6 +326,7 @@ NextHopConsistentHash::findNextHop(TSHttpTxn txnp, void *ih, time_t now) // check if the host is retryable. It's retryable if the retry window has elapsed _now == 0 ? _now = time(nullptr) : _now = now; if ((pRec->failedAt.load() + retry_time) < static_cast(_now)) { + NH_Debug(NH_DEBUG_TAG, "[%" PRIu64 "] next hop %s, retriers: %d", sm_id, pRec->hostname.c_str(), pRec->retriers.load()); if (pRec->retriers.fetch_add(1, std::memory_order_relaxed) < max_retriers) { nextHopRetry = true; result->last_parent = pRec->host_index; diff --git a/proxy/http/remap/NextHopHealthStatus.cc b/proxy/http/remap/NextHopHealthStatus.cc index 9dad0cb6a4a..b81fc4c049d 100644 --- a/proxy/http/remap/NextHopHealthStatus.cc +++ b/proxy/http/remap/NextHopHealthStatus.cc @@ -149,3 +149,29 @@ NextHopHealthStatus::markNextHop(TSHttpTxn txn, const char *hostname, const int break; } } + +void +NextHopHealthStatus::retryComplete(TSHttpTxn txn, const char *hostname, const int port) +{ + HttpSM *sm = reinterpret_cast(txn); + ParentResult result = sm->t_state.parent_result; + int64_t sm_id = sm->sm_id; + + // make sure we're called back with a result structure for a parent that is being retried. + if (result.result != PARENT_SPECIFIED && !result.retry) { + return; + } + + const std::string host_port = HostRecord::makeHostPort(hostname, port); + auto iter = host_map.find(host_port); + if (iter == host_map.end()) { + NH_Debug(NH_DEBUG_TAG, "[%" PRId64 "] no host named %s found in host_map", sm_id, host_port.c_str()); + return; + } + + std::shared_ptr h = iter->second; + + if (h->retriers.fetch_sub(1, std::memory_order_relaxed) < 0) { + h->retriers = 0; + } +} diff --git a/proxy/http/remap/NextHopSelectionStrategy.cc b/proxy/http/remap/NextHopSelectionStrategy.cc index c57555ee1cc..4bbccbd6dbc 100644 --- a/proxy/http/remap/NextHopSelectionStrategy.cc +++ b/proxy/http/remap/NextHopSelectionStrategy.cc @@ -293,6 +293,12 @@ NextHopSelectionStrategy::responseIsRetryable(int64_t sm_id, HttpTransact::Curre return PARENT_RETRY_NONE; } +void +NextHopSelectionStrategy::retryComplete(TSHttpTxn txnp, const char *hostname, const int port) +{ + return passive_health.retryComplete(txnp, hostname, port); +} + namespace YAML { template <> struct convert { diff --git a/proxy/http/remap/NextHopSelectionStrategy.h b/proxy/http/remap/NextHopSelectionStrategy.h index 0c483a3668b..08dd7c59a22 100644 --- a/proxy/http/remap/NextHopSelectionStrategy.h +++ b/proxy/http/remap/NextHopSelectionStrategy.h @@ -53,6 +53,7 @@ struct NHHealthStatus { virtual bool isNextHopAvailable(TSHttpTxn txn, const char *hostname, const int port, void *ih = nullptr) = 0; virtual void markNextHop(TSHttpTxn txn, const char *hostname, const int port, const NHCmd status, void *ih = nullptr, const time_t now = 0) = 0; + virtual void retryComplete(TSHttpTxn txn, const char *hostname, const int port) = 0; virtual ~NHHealthStatus() {} }; @@ -173,9 +174,7 @@ struct HostRecord : ATSConsistentHashNode { std::lock_guard lock(_mutex); failedAt = time(nullptr); available = false; - if (--retriers < 0) { - retriers = 0; - } + retriers = 0; } } @@ -226,6 +225,7 @@ class NextHopHealthStatus : public NHHealthStatus bool isNextHopAvailable(TSHttpTxn txn, const char *hostname, const int port, void *ih = nullptr) override; void markNextHop(TSHttpTxn txn, const char *hostname, const int port, const NHCmd status, void *ih = nullptr, const time_t now = 0) override; + void retryComplete(TSHttpTxn txn, const char *hostname, const int port) override; NextHopHealthStatus(){}; private: @@ -246,6 +246,8 @@ class NextHopSelectionStrategy virtual ParentRetry_t responseIsRetryable(int64_t sm_id, HttpTransact::CurrentInfo ¤t_info, HTTPStatus response_code); + void retryComplete(TSHttpTxn txn, const char *hostname, const int port); + std::string strategy_name; bool go_direct = true; bool parent_is_proxy = true;