Skip to content

Commit 4689ac0

Browse files
committed
This PR updates parent selection to limit the number of simultaneous
transactions that will retry a parent once it's retry_time window has elapsed. The limiting prevents a thundering retry of a parent by hundreds of transactions that could very will hang if the parent is still unreachable following retry_time expiration, 'proxy.config.http.parent_proxy.max_trans_retries' which defaults to 2.
1 parent df01ace commit 4689ac0

File tree

8 files changed

+77
-8
lines changed

8 files changed

+77
-8
lines changed

doc/admin-guide/files/records.config.en.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,11 @@ Parent Proxy Configuration
11941194

11951195
The amount of time allowed between connection retries to a parent cache that is unavailable.
11961196

1197+
.. ts:cv:: CONFIG proxy.config.http.parent_proxy.max_trans_retries INT 2
1198+
1199+
Limits the number of simultaneous transactions that may retry a parent once the parents
1200+
``retry_time`` has expired.
1201+
11971202
.. ts:cv:: CONFIG proxy.config.http.parent_proxy.fail_threshold INT 10
11981203
:reloadable:
11991204
:overridable:

mgmt/RecordsConfig.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,8 @@ static const RecordElement RecordsConfig[] =
425425
//# the retry window for the parent to be marked down
426426
{RECT_CONFIG, "proxy.config.http.parent_proxy.fail_threshold", RECD_INT, "10", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL}
427427
,
428+
{RECT_CONFIG, "proxy.config.http.parent_proxy.max_trans_retries", RECD_INT, "2", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL}
429+
,
428430
{RECT_CONFIG, "proxy.config.http.parent_proxy.total_connect_attempts", RECD_INT, "4", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL}
429431
,
430432
{RECT_CONFIG, "proxy.config.http.parent_proxy.per_parent_connect_attempts", RECD_INT, "2", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL}

proxy/ParentConsistentHash.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -231,9 +231,11 @@ ParentConsistentHash::selectParent(bool first_call, ParentResult *result, Reques
231231
// check if the host is retryable. It's retryable if the retry window has elapsed
232232
// and the global host status is HOST_STATUS_UP
233233
if (pRec && !pRec->available && host_stat == HOST_STATUS_UP) {
234-
Debug("parent_select", "Parent.failedAt = %u, retry = %u, xact_start = %u", (unsigned int)pRec->failedAt,
235-
(unsigned int)retry_time, (unsigned int)request_info->xact_start);
236-
if ((pRec->failedAt + retry_time) < request_info->xact_start) {
234+
Debug("parent_select", "Parent.failedAt = %u, retry = %u, retriers = %d, max_retriers = %u, xact_start = %u",
235+
static_cast<unsigned>(pRec->failedAt), static_cast<unsigned>(retry_time), pRec->retriers, max_retriers,
236+
static_cast<unsigned>(request_info->xact_start));
237+
if (((pRec->failedAt + retry_time) < request_info->xact_start) && (pRec->retriers < max_retriers) &&
238+
ink_atomic_increment(&pRec->retriers, 1) < max_retriers) {
237239
parentRetry = true;
238240
// make sure that the proper state is recorded in the result structure
239241
result->last_parent = pRec->idx;
@@ -389,6 +391,7 @@ ParentConsistentHash::markParentUp(ParentResult *result)
389391

390392
ink_atomic_swap(&pRec->failedAt, static_cast<time_t>(0));
391393
int old_count = ink_atomic_swap(&pRec->failCount, 0);
394+
ink_atomic_swap(&pRec->retriers, 0);
392395

393396
if (old_count > 0) {
394397
Note("http parent proxy %s:%d restored", pRec->hostname, pRec->port);

proxy/ParentRoundRobin.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,9 +156,12 @@ ParentRoundRobin::selectParent(bool first_call, ParentResult *result, RequestDat
156156
}
157157
} else {
158158
if ((result->wrap_around) ||
159-
((parents[cur_index].failedAt + retry_time) < request_info->xact_start && host_stat == HOST_STATUS_UP)) {
160-
Debug("parent_select", "Parent[%d].failedAt = %u, retry = %u,xact_start = %" PRId64 " but wrap = %d", cur_index,
161-
(unsigned)parents[cur_index].failedAt, retry_time, (int64_t)request_info->xact_start, result->wrap_around);
159+
((((parents[cur_index].failedAt + retry_time) < request_info->xact_start) && host_stat == HOST_STATUS_UP) &&
160+
(parents[cur_index].retriers < max_retriers && ink_atomic_increment(&parents[cur_index].retriers, 1) < max_retriers))) {
161+
Debug("parent_select",
162+
"Parent[%d].failedAt = %u, retry = %u, retriers = %d, max_retriers = %u, xact_start = %" PRId64 " but wrap = %d",
163+
cur_index, static_cast<unsigned>(parents[cur_index].failedAt), retry_time, parents[cur_index].retriers, max_retriers,
164+
static_cast<int64_t>(request_info->xact_start), result->wrap_around);
162165
// Reuse the parent
163166
parentUp = true;
164167
parentRetry = true;

proxy/ParentSelection.cc

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,7 @@ ParentRecord::ProcessParents(char *val, bool isPrimary)
539539
this->parents[i].name = this->parents[i].hostname;
540540
this->parents[i].available = true;
541541
this->parents[i].weight = weight;
542+
this->parents[i].retriers = 0;
542543
if (tmp3) {
543544
memcpy(this->parents[i].hash_string, tmp3 + 1, strlen(tmp3));
544545
this->parents[i].name = this->parents[i].hash_string;
@@ -554,6 +555,7 @@ ParentRecord::ProcessParents(char *val, bool isPrimary)
554555
this->secondary_parents[i].name = this->secondary_parents[i].hostname;
555556
this->secondary_parents[i].available = true;
556557
this->secondary_parents[i].weight = weight;
558+
this->secondary_parents[i].retriers = 0;
557559
if (tmp3) {
558560
memcpy(this->secondary_parents[i].hash_string, tmp3 + 1, strlen(tmp3));
559561
this->secondary_parents[i].name = this->secondary_parents[i].hash_string;
@@ -1105,6 +1107,11 @@ EXCLUSIVE_REGRESSION_TEST(PARENTSELECTION)(RegressionTest * /* t ATS_UNUSED */,
11051107
params->findParent(request, result, fail_threshold, retry_time); \
11061108
} while (0)
11071109

1110+
#define SET_MAX_RETRIERS(x) \
1111+
do { \
1112+
RecSetRecordInt("proxy.config.http.parent_proxy.max_trans_retries", x, REC_SOURCE_DEFAULT); \
1113+
} while (0)
1114+
11081115
// start tests by marking up all tests hosts that will be marked down
11091116
// as part of testing. This will insure that test hosts are not loaded
11101117
// from records.snap as DOWN due to previous testing.
@@ -1117,6 +1124,8 @@ EXCLUSIVE_REGRESSION_TEST(PARENTSELECTION)(RegressionTest * /* t ATS_UNUSED */,
11171124
_st.setHostStatus("curly", HOST_STATUS_UP, 0, Reason::MANUAL);
11181125

11191126
// Test 1
1127+
SET_MAX_RETRIERS(20);
1128+
RecSetRecordInt("proxy.config.http.parent_proxy.max_trans_retries", 20, REC_SOURCE_DEFAULT);
11201129
tbl[0] = '\0';
11211130
ST(1);
11221131
T("dest_domain=. parent=red:37412,orange:37412,yellow:37412 round_robin=strict\n");
@@ -1831,6 +1840,39 @@ EXCLUSIVE_REGRESSION_TEST(PARENTSELECTION)(RegressionTest * /* t ATS_UNUSED */,
18311840
FP;
18321841
RE(verify(result, PARENT_SPECIFIED, "carol", 80), 211);
18331842

1843+
// max_retriers tests
1844+
SET_MAX_RETRIERS(1);
1845+
1846+
// Test 212
1847+
tbl[0] = '\0';
1848+
ST(212);
1849+
T("dest_domain=mouse.com parent=mickey:80|0.33;minnie:80|0.33;goofy:80|0.33 "
1850+
"round_robin=consistent_hash go_direct=false\n");
1851+
REBUILD;
1852+
REINIT;
1853+
br(request, "i.am.mouse.com");
1854+
FP;
1855+
RE(verify(result, PARENT_SPECIFIED, "goofy", 80), 212);
1856+
1857+
// Test 213
1858+
// markdown goofy and minnie gets chosen.
1859+
ST(213);
1860+
params->markParentDown(result, fail_threshold, retry_time); // marked down goofy
1861+
REINIT;
1862+
br(request, "i.am.mouse.com");
1863+
FP;
1864+
RE(verify(result, PARENT_SPECIFIED, "minnie", 80), 213);
1865+
1866+
// Test 214
1867+
// goofy gets chosen because max_retriers was set to 1
1868+
// and goofy becomes available.
1869+
sleep(params->policy.ParentRetryTime + 2);
1870+
ST(214);
1871+
REINIT;
1872+
br(request, "i.am.mouse.com");
1873+
FP;
1874+
RE(verify(result, PARENT_SPECIFIED, "goofy", 80), 214);
1875+
18341876
delete request;
18351877
delete result;
18361878
delete params;

proxy/ParentSelection.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,14 @@ struct SimpleRetryResponseCodes {
112112
struct pRecord : ATSConsistentHashNode {
113113
char hostname[MAXDNAME + 1];
114114
int port;
115-
time_t failedAt;
116-
int failCount;
115+
time_t failedAt = 0;
116+
int failCount = 0;
117117
int32_t upAt;
118118
const char *scheme; // for which parent matches (if any)
119119
int idx;
120120
float weight;
121121
char hash_string[MAXDNAME + 1];
122+
int retriers = 0;
122123
};
123124

124125
typedef ControlMatcher<ParentRecord, ParentResult> P_table;
@@ -332,6 +333,9 @@ struct ParentSelectionPolicy {
332333
class ParentSelectionStrategy
333334
{
334335
public:
336+
int max_retriers = 0;
337+
338+
ParentSelectionStrategy() { REC_ReadConfigInteger(max_retriers, "proxy.config.http.parent_proxy.max_trans_retries"); }
335339
//
336340
// Return the pRecord.
337341
virtual pRecord *getParents(ParentResult *result) = 0;

proxy/ParentSelectionStrategy.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ ParentSelectionStrategy::markParentDown(ParentResult *result, unsigned int fail_
6565
// must set the count to reflect this
6666
if (result->retry == false) {
6767
new_fail_count = pRec->failCount = 1;
68+
} else {
69+
// this was a retry that failed, decrement the retriers count
70+
if (ink_atomic_decrement(&pRec->retriers, 1) < 0) {
71+
ink_atomic_swap(&pRec->retriers, 0);
72+
}
6873
}
6974

7075
Note("Parent %s marked as down %s:%d", (result->retry) ? "retry" : "initially", pRec->hostname, pRec->port);
@@ -121,6 +126,8 @@ ParentSelectionStrategy::markParentUp(ParentResult *result)
121126

122127
ink_atomic_swap(&pRec->failedAt, static_cast<time_t>(0));
123128
int old_count = ink_atomic_swap(&pRec->failCount, 0);
129+
// a retry succeeded, just reset retriers
130+
ink_atomic_swap(&pRec->retriers, 0);
124131

125132
if (old_count > 0) {
126133
Note("http parent proxy %s:%d restored", pRec->hostname, pRec->port);

proxy/http/HttpTransact.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,9 @@ parentExists(HttpTransact::State *s)
236236
inline static void
237237
nextParent(HttpTransact::State *s)
238238
{
239+
TxnDebug("parent_down", "sm_id[%" PRId64 "] connection to parent %s failed, conn_state: %s, request to origin: %s",
240+
s->state_machine->sm_id, s->parent_result.hostname, HttpDebugNames::get_server_state_name(s->current.state),
241+
s->request_data.get_host());
239242
url_mapping *mp = s->url_map.getMapping();
240243
if (mp && mp->strategy) {
241244
// NextHop only has a findNextHop() function.

0 commit comments

Comments
 (0)