Skip to content

Commit

Permalink
Fix plugin parent_select failover
Browse files Browse the repository at this point in the history
  • Loading branch information
rob05c committed Feb 17, 2022
1 parent e29827f commit c0b3803
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 121 deletions.
153 changes: 43 additions & 110 deletions plugins/experimental/parent_select/parent_select.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,65 +47,9 @@ struct StrategyTxn {
TSNextHopSelectionStrategy *strategy;
void *txn; // void* because the actual type will depend on the strategy.
int request_count;
const char *prev_host; // the actually tried host, used when send_request sets the response_action to be the next thing to try.
size_t prev_host_len;
in_port_t prev_port;
bool prev_is_retry;
bool prev_no_cache;
TSResponseAction prev_ra;
};

int
handle_send_request(TSHttpTxn txnp, StrategyTxn *strategyTxn)
{
TSDebug(PLUGIN_NAME, "handle_send_request calling");
TSDebug(PLUGIN_NAME, "handle_send_request got strategy '%s'", strategyTxn->strategy->name());

auto strategy = strategyTxn->strategy;

// if (strategyTxn->retry_attempts == 0) {
// // just did a DoRemap, which means we need to set the response_action of what to do in the event of failure
// // because a failure might not call read_response (e.g. dns failure)
// strategyTxn->retry_attempts = 1;
// TSHttpTxnReenable(txnp, TS_EVENT_HTTP_CONTINUE);
// return TS_SUCCESS;
// }

// before sending a req, we need to set what to do on failure.
// Because some failures don't call handle_response before getting to HttpTransact::HandleResponse
// (e.g. connection failures)

TSResponseAction ra;
TSHttpTxnResponseActionGet(txnp, &ra);

TSDebug(PLUGIN_NAME, "handle_send_request setting prev %.*s:%d", int(ra.hostname_len), ra.hostname, ra.port);
strategyTxn->prev_host = ra.hostname;
strategyTxn->prev_host_len = ra.hostname_len;
strategyTxn->prev_port = ra.port;
strategyTxn->prev_is_retry = ra.is_retry;
strategyTxn->prev_no_cache = ra.no_cache;

strategy->next(txnp, strategyTxn->txn, ra.hostname, ra.hostname_len, ra.port, &ra.hostname, &ra.hostname_len, &ra.port,
&ra.is_retry, &ra.no_cache);

ra.nextHopExists = ra.hostname_len != 0;
ra.fail = !ra.nextHopExists; // failed is whether to fail and return to the client. failed=false means to retry the parent we set
// in the response_action

// we don't know if it's retryable yet, because we don't have a status. So set it retryable if we have something which could be
// retried. We'll set it retryable per the status in handle_response, and os_dns (which is called on connection failures, and
// always retryable [notwithstanding num_retries]).
ra.responseIsRetryable = ra.nextHopExists;
ra.goDirect = strategy->goDirect();
ra.parentIsProxy = strategy->parentIsProxy();

TSDebug(PLUGIN_NAME, "handle_send_request setting response_action hostname '%.*s' port %d direct %d proxy %d",
int(ra.hostname_len), ra.hostname, ra.port, ra.goDirect, ra.parentIsProxy);
TSHttpTxnResponseActionSet(txnp, &ra);

TSHttpTxnReenable(txnp, TS_EVENT_HTTP_CONTINUE);
return TS_SUCCESS;
}

// mark parents up or down, on failure or successful retry.
void
mark_response(TSHttpTxn txnp, StrategyTxn *strategyTxn, TSHttpStatus status)
Expand All @@ -118,17 +62,14 @@ mark_response(TSHttpTxn txnp, StrategyTxn *strategyTxn, TSHttpStatus status)

TSResponseAction ra;
// if the prev_host isn't null, then that was the actual host we tried which needs to be marked down.
if (strategyTxn->prev_host != nullptr) {
ra.hostname = strategyTxn->prev_host;
ra.hostname_len = strategyTxn->prev_host_len;
ra.port = strategyTxn->prev_port;
ra.is_retry = strategyTxn->prev_is_retry;
ra.no_cache = strategyTxn->prev_no_cache;
if (strategyTxn->prev_ra.hostname_len != 0) {
ra = strategyTxn->prev_ra;
TSDebug(PLUGIN_NAME, "mark_response using prev %.*s:%d", int(ra.hostname_len), ra.hostname, ra.port);
} else {
TSHttpTxnResponseActionGet(txnp, &ra);
TSDebug(PLUGIN_NAME, "mark_response using response_action %.*s:%d", int(ra.hostname_len), ra.hostname, ra.port);
}

if (isFailure && strategy->onFailureMarkParentDown(status)) {
if (ra.hostname == nullptr) {
TSError(
Expand Down Expand Up @@ -159,13 +100,6 @@ handle_read_response(TSHttpTxn txnp, StrategyTxn *strategyTxn)

TSDebug(PLUGIN_NAME, "handle_read_response got strategy '%s'", strategy->name());

// increment request count here, not send_request, because we need to consistently increase with os_dns hooks.
// if we incremented the req count in send_request and not here, that would never be called on DNS failures, but DNS successes
// would call os_dns and also send_request, resulting in dns failures incrementing the count by 1, and dns successes but http
// failures would increment by 2. And successes would increment by 2. Hence, the only consistent way to count requests is on
// read_response and os_dns, and not send_request.
++strategyTxn->request_count;

TSMBuffer resp;
TSMLoc resp_hdr;
if (TS_SUCCESS != TSHttpTxnServerRespGet(txnp, &resp, &resp_hdr)) {
Expand Down Expand Up @@ -193,18 +127,15 @@ handle_read_response(TSHttpTxn txnp, StrategyTxn *strategyTxn)
// Status.
TSResponseAction ra;
TSHttpTxnResponseActionGet(txnp, &ra);
ra.responseIsRetryable = strategy->responseIsRetryable(strategyTxn->request_count, status);
ra.responseIsRetryable = strategy->responseIsRetryable(strategyTxn->request_count - 1, status);
TSHttpTxnResponseActionSet(txnp, &ra);
}

// un-set the "prev" hackery. That only exists for markdown, which we just did.
// The response_action is now the next thing to try, if this was a failure,
// and should now be considered authoritative for everything.
strategyTxn->prev_host = nullptr;
strategyTxn->prev_host_len = 0;
strategyTxn->prev_port = 0;
strategyTxn->prev_is_retry = false;
strategyTxn->prev_no_cache = false;

memset(&strategyTxn->prev_ra, 0, sizeof(TSResponseAction));

TSHandleMLocRelease(resp, TS_NULL_MLOC, resp_hdr);
TSHttpTxnReenable(txnp, TS_EVENT_HTTP_CONTINUE);
Expand All @@ -216,6 +147,28 @@ handle_os_dns(TSHttpTxn txnp, StrategyTxn *strategyTxn)
{
TSDebug(PLUGIN_NAME, "handle_os_dns calling");

const TSServerState server_state = TSHttpTxnServerStateGet(txnp);
TSDebug(PLUGIN_NAME, "handle_os_dns ServerState %d", server_state);

// This is the server that core is trying, which may not be the latest response_action if core hasn't decided to re-read it.
//
// Specifically, core will usually retry the same parent without asking parent/strategies for a new parent, up to records.config
// per_parent_connect_attempts, calling the os_dns hook each time. When that happens, we need to detect it, and not keep getting
// subsequent parents and marking down prior response_action hosts.
const char *core_next_hop_name = TSHttpTxnNextHopNameGet(txnp);
TSDebug(PLUGIN_NAME, "handle_os_dns TxnNextHopName %s", core_next_hop_name);

const sockaddr *server_addr = TSHttpTxnServerAddrGet(txnp);
TSDebug(PLUGIN_NAME, "handle_os_dns TSHttpTxnServerAddrGet %p", server_addr);

const sockaddr *nh_addr = TSHttpTxnNextHopAddrGet(txnp);
TSDebug(PLUGIN_NAME, "handle_os_dns TSHttpTxnNextHopAddrGet %p", nh_addr);

const char *pp_name;
int port;
TSHttpTxnParentProxyGet(txnp, &pp_name, &port);
TSDebug(PLUGIN_NAME, "handle_os_dns TSHttpTxnParentProxyGet %s", pp_name);

++strategyTxn->request_count; // this is called after connection failures. So if we got here, we attempted a request

// This is not called on the first attempt.
Expand All @@ -226,38 +179,30 @@ handle_os_dns(TSHttpTxn txnp, StrategyTxn *strategyTxn)

TSDebug(PLUGIN_NAME, "handle_os_dns got strategy '%s'", strategy->name());

mark_response(txnp, strategyTxn, STATUS_CONNECTION_FAILURE);

// now, we need to figure out, are we the first call after send_response set the response_action as the next-thing-to-try,
// or are we a subsequent call, and need to actually set a new response_action

if (strategyTxn->prev_host != nullptr) {
TSDebug(PLUGIN_NAME, "handle_os_dns had prev, keeping existing response_action and un-setting prev");
// if strategyTxn->prev_host exists, this is the very first call after send_response set the response_action to the next thing
// to try. and no handle_response was called in-between (because it was a connection or dns failure) So keep that, and set
// prev_host=nullptr (so we get a new response_action the next time we're called)
strategyTxn->prev_host = nullptr;
strategyTxn->prev_port = 0;
strategyTxn->prev_is_retry = false;
strategyTxn->prev_no_cache = false;
TSHttpTxnReenable(txnp, TS_EVENT_HTTP_CONTINUE);
return TS_SUCCESS;
if (server_state == TS_SRVSTATE_CONNECTION_ERROR || server_state == TS_SRVSTATE_INACTIVE_TIMEOUT) {
mark_response(txnp, strategyTxn, STATUS_CONNECTION_FAILURE);
}

TSDebug(PLUGIN_NAME, "handle_os_dns had no prev, setting new response_action");

{
TSResponseAction ra;
TSHttpTxnResponseActionGet(txnp, &ra);
strategyTxn->prev_ra = ra;
}

TSResponseAction ra;
memset(&ra, 0, sizeof(TSResponseAction)); // because {0} gives a C++ warning. Ugh.
const char *const exclude_host = nullptr;
const size_t exclude_host_len = 0;
const in_port_t exclude_port = 0;
memset(&ra, 0, sizeof(TSResponseAction));
const char *const exclude_host = strategyTxn->prev_ra.hostname;
const size_t exclude_host_len = strategyTxn->prev_ra.hostname_len;
const in_port_t exclude_port = strategyTxn->prev_ra.port;
strategy->next(txnp, strategyTxn->txn, exclude_host, exclude_host_len, exclude_port, &ra.hostname, &ra.hostname_len, &ra.port,
&ra.is_retry, &ra.no_cache);

ra.fail = ra.hostname == nullptr; // failed is whether to immediately fail and return the client a 502. In this case: whether or
// not we found another parent.
ra.nextHopExists = ra.hostname_len != 0;
ra.responseIsRetryable = strategy->responseIsRetryable(strategyTxn->request_count, STATUS_CONNECTION_FAILURE);
ra.responseIsRetryable = strategy->responseIsRetryable(strategyTxn->request_count - 1, STATUS_CONNECTION_FAILURE);
ra.goDirect = strategy->goDirect();
ra.parentIsProxy = strategy->parentIsProxy();
TSDebug(PLUGIN_NAME, "handle_os_dns setting response_action hostname '%.*s' port %d direct %d proxy %d is_retry %d exists %d",
Expand Down Expand Up @@ -298,14 +243,8 @@ handle_hook(TSCont contp, TSEvent event, void *edata)
TSDebug(PLUGIN_NAME, "handle_hook got strategy '%s'", strategyTxn->strategy->name());

switch (event) {
// case TS_EVENT_HTTP_READ_REQUEST_HDR:
// return handle_read_request(txnp, strategyTxn);
case TS_EVENT_HTTP_SEND_REQUEST_HDR:
return handle_send_request(txnp, strategyTxn);
case TS_EVENT_HTTP_READ_RESPONSE_HDR:
return handle_read_response(txnp, strategyTxn);
// case TS_EVENT_HTTP_SEND_RESPONSE_HDR:
// return handle_send_response(txnp, strategyTxn);
case TS_EVENT_HTTP_OS_DNS:
return handle_os_dns(txnp, strategyTxn);
case TS_EVENT_HTTP_TXN_CLOSE:
Expand Down Expand Up @@ -417,16 +356,10 @@ TSRemapDoRemap(void *ih, TSHttpTxn txnp, TSRemapRequestInfo *rri)
strategyTxn->strategy = strategy;
strategyTxn->txn = strategy->newTxn();
strategyTxn->request_count = 0;
strategyTxn->prev_host = nullptr;
strategyTxn->prev_port = 0;
strategyTxn->prev_is_retry = false;
strategyTxn->prev_no_cache = false;
memset(&strategyTxn->prev_ra, 0, sizeof(TSResponseAction));
TSContDataSet(cont, (void *)strategyTxn);

// TSHttpTxnHookAdd(txnp, TS_HTTP_READ_REQUEST_HDR_HOOK, cont);
TSHttpTxnHookAdd(txnp, TS_HTTP_SEND_REQUEST_HDR_HOOK, cont);
TSHttpTxnHookAdd(txnp, TS_HTTP_READ_RESPONSE_HDR_HOOK, cont);
// TSHttpTxnHookAdd(txnp, TS_HTTP_SEND_RESPONSE_HDR_HOOK, cont);
TSHttpTxnHookAdd(txnp, TS_HTTP_OS_DNS_HOOK, cont);
TSHttpTxnHookAdd(txnp, TS_HTTP_TXN_CLOSE_HOOK, cont);

Expand Down
28 changes: 17 additions & 11 deletions proxy/http/HttpTransact.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1753,11 +1753,21 @@ HttpTransact::HandleApiErrorJump(State *s)
return;
}

// PPDNSLookupAPICall does an API callout, then calls PPDNSLookup
// PPDNSLookupAPICall does an API callout if a plugin set the response_action,
// then calls PPDNSLookup.
// This is to preserve plugin hook calling behavior pre-9, which didn't call
// the TS_HTTP_OS_DNS_HOOK on PPDNSLookup.
// Since response_action is new in 9, only new plugins intentionally setting
// it will have the new behavior of TS_HTTP_OS_DNS_HOOK firing on PPDNSLookup.
void
HttpTransact::PPDNSLookupAPICall(State *s)
{
TRANSACT_RETURN(SM_ACTION_API_OS_DNS, PPDNSLookup);
TxnDebug("http_trans", "[HttpTransact::PPDNSLookupAPICall] response_action.handled %d", s->response_action.handled);
if (!s->response_action.handled) {
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
} else {
TRANSACT_RETURN(SM_ACTION_API_OS_DNS, PPDNSLookup);
}
}

///////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -1798,11 +1808,7 @@ HttpTransact::PPDNSLookup(State *s)

if (!s->current.server->dst_addr.isValid()) {
if (s->current.request_to == PARENT_PROXY) {
if (!s->response_action.handled) {
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
} else {
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookupAPICall);
}
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookupAPICall);
} else if (s->parent_result.result == PARENT_DIRECT && s->http_config_param->no_dns_forward_to_parent != 1) {
// We ran out of parents but parent configuration allows us to go to Origin Server directly
CallOSDNSLookup(s);
Expand Down Expand Up @@ -2243,7 +2249,7 @@ HttpTransact::LookupSkipOpenServer(State *s)
find_server_and_update_current_info(s);

if (s->current.request_to == PARENT_PROXY) {
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookupAPICall);
} else if (s->parent_result.result == PARENT_FAIL) {
handle_parent_died(s);
return;
Expand Down Expand Up @@ -2950,7 +2956,7 @@ HttpTransact::HandleCacheOpenReadHit(State *s)
ink_assert(s->pending_work == nullptr);
s->pending_work = issue_revalidate;

TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookupAPICall);
} else if (s->current.request_to == ORIGIN_SERVER) {
return CallOSDNSLookup(s);
} else {
Expand Down Expand Up @@ -3377,7 +3383,7 @@ HttpTransact::HandleCacheOpenReadMiss(State *s)
return CallOSDNSLookup(s);
}
if (s->current.request_to == PARENT_PROXY) {
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, HttpTransact::PPDNSLookup);
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, HttpTransact::PPDNSLookupAPICall);
} else {
handle_parent_died(s);
return;
Expand Down Expand Up @@ -3746,7 +3752,7 @@ HttpTransact::handle_response_from_parent(State *s)
switch (next_lookup) {
case PARENT_PROXY:
ink_assert(s->current.request_to == PARENT_PROXY);
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookupAPICall);
break;
case ORIGIN_SERVER:
// Next lookup is Origin Server, try DNS for Origin Server
Expand Down

0 comments on commit c0b3803

Please sign in to comment.