Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

router: add API to retry reset before request #35074

Merged
merged 10 commits into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelogs/current.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,9 @@ new_features:
<envoy_v3_api_field_extensions.transport_sockets.tls.v3.CertificateValidationContext.match_typed_subject_alt_names>`.
An additional field ``oid`` is added to :ref:`SubjectAltNameMatcher
<envoy_v3_api_msg_extensions.transport_sockets.tls.v3.SubjectAltNameMatcher>` to support this change.
- area: retry
change: |
Added :ref:`reset-before-request retry policy <config_http_filters_router_x-envoy-retry-on>`.
- area: ext_proc
change: |
Added support for observability mode which deprecates ``async_mode``. If enabled, each part of the HTTP request or response
Expand Down
4 changes: 4 additions & 0 deletions docs/root/configuration/http/http_filters/router_filter.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ gateway-error
reset
Envoy will attempt a retry if the upstream server does not respond at all (disconnect/reset/read timeout.)

reset-before-request
Equivalent to *reset* but will only retry requests that have not been sent to the upstream server
(i.e. the headers have not been sent).

connect-failure
Envoy will attempt a retry if a request is failed because of a connection failure to the upstream
server (connect timeout, etc.). (Included in *5xx*)
Expand Down
2 changes: 1 addition & 1 deletion docs/root/faq/load_balancing/transient_failures.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ of times the host has been ejected).
.. code-block:: json

{
"retry_on": "cancelled,connect-failure,gateway-error,refused-stream,reset,resource-exhausted,unavailable",
"retry_on": "cancelled,connect-failure,gateway-error,refused-stream,reset,reset-before-request,resource-exhausted,unavailable",
"num_retries": 1,
"retry_host_predicate": [
{
Expand Down
9 changes: 7 additions & 2 deletions envoy/router/router.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ class RetryPolicy {
static constexpr uint32_t RETRY_ON_RETRIABLE_HEADERS = 0x1000;
static constexpr uint32_t RETRY_ON_ENVOY_RATE_LIMITED = 0x2000;
static constexpr uint32_t RETRY_ON_HTTP3_POST_CONNECT_FAILURE = 0x4000;
static constexpr uint32_t RETRY_ON_RESET_BEFORE_REQUEST = 0x8000;
// clang-format on

virtual ~RetryPolicy() = default;
Expand Down Expand Up @@ -440,13 +441,17 @@ class RetryState {
* not. nullopt means it wasn't sent at all before getting reset.
* @param callback supplies the callback that will be invoked when the retry should take place.
* This is used to add timed backoff, etc. The callback will never be called
* inline.
* inline.
* @param upstream_request_started indicates whether the first byte has been transmitted to the
* upstream server.
*
* @return RetryStatus if a retry should take place. @param callback will be called at some point
* in the future. Otherwise a retry should not take place and the callback will never be
* called. Calling code should proceed with error handling.
*/
virtual RetryStatus shouldRetryReset(Http::StreamResetReason reset_reason, Http3Used http3_used,
DoRetryResetCallback callback) PURE;
DoRetryResetCallback callback,
bool upstream_request_started) PURE;

/**
* Determine whether a "hedged" retry should be sent after the per try
Expand Down
1 change: 1 addition & 0 deletions source/common/http/headers.h
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ class HeaderValues {
const std::string RetriableStatusCodes{"retriable-status-codes"};
const std::string RetriableHeaders{"retriable-headers"};
const std::string Reset{"reset"};
const std::string ResetBeforeRequest{"reset-before-request"};
const std::string Http3PostConnectFailure{"http3-post-connect-failure"};
} EnvoyRetryOnValues;

Expand Down
21 changes: 17 additions & 4 deletions source/common/router/retry_state_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,8 @@ std::pair<uint32_t, bool> RetryStateImpl::parseRetryOn(absl::string_view config)
ret |= RetryPolicy::RETRY_ON_RETRIABLE_HEADERS;
} else if (retry_on == Http::Headers::get().EnvoyRetryOnValues.Reset) {
ret |= RetryPolicy::RETRY_ON_RESET;
} else if (retry_on == Http::Headers::get().EnvoyRetryOnValues.ResetBeforeRequest) {
ret |= RetryPolicy::RETRY_ON_RESET_BEFORE_REQUEST;
} else if (retry_on == Http::Headers::get().EnvoyRetryOnValues.Http3PostConnectFailure) {
ret |= RetryPolicy::RETRY_ON_HTTP3_POST_CONNECT_FAILURE;
} else {
Expand Down Expand Up @@ -356,11 +358,13 @@ RetryStatus RetryStateImpl::shouldRetryHeaders(const Http::ResponseHeaderMap& re
}

RetryStatus RetryStateImpl::shouldRetryReset(Http::StreamResetReason reset_reason,
Http3Used http3_used, DoRetryResetCallback callback) {
Http3Used http3_used, DoRetryResetCallback callback,
bool upstream_request_started) {

// Following wouldRetryFromReset() may override the value.
bool disable_http3 = false;
const RetryDecision retry_decision = wouldRetryFromReset(reset_reason, http3_used, disable_http3);
const RetryDecision retry_decision =
wouldRetryFromReset(reset_reason, http3_used, disable_http3, upstream_request_started);
return shouldRetry(retry_decision, [disable_http3, callback]() { callback(disable_http3); });
}

Expand Down Expand Up @@ -458,7 +462,8 @@ RetryStateImpl::wouldRetryFromHeaders(const Http::ResponseHeaderMap& response_he

RetryState::RetryDecision
RetryStateImpl::wouldRetryFromReset(const Http::StreamResetReason reset_reason,
Http3Used http3_used, bool& disable_http3) {
Http3Used http3_used, bool& disable_http3,
bool upstream_request_started) {
ASSERT(!disable_http3);
// First check "never retry" conditions so we can short circuit (we never
// retry if the reset reason is overflow).
Expand All @@ -477,7 +482,7 @@ RetryStateImpl::wouldRetryFromReset(const Http::StreamResetReason reset_reason,
"0-RTT was attempted on non-Quic connection and failed.");
return RetryDecision::RetryImmediately;
}
if ((retry_on_ & RetryPolicy::RETRY_ON_CONNECT_FAILURE)) {
if (retry_on_ & RetryPolicy::RETRY_ON_CONNECT_FAILURE) {
// This is a pool failure.
return RetryDecision::RetryWithBackoff;
}
Expand All @@ -489,6 +494,14 @@ RetryStateImpl::wouldRetryFromReset(const Http::StreamResetReason reset_reason,
return RetryDecision::RetryImmediately;
}

// Technically, this doesn't *have* to go before the RETRY_ON_RESET check,
// but it's safer for the user if they have them both set
// for some reason.
if (retry_on_ & RetryPolicy::RETRY_ON_RESET_BEFORE_REQUEST && !upstream_request_started) {
// Only return a positive retry decision if we haven't sent any bytes upstream.
return RetryDecision::RetryWithBackoff;
}

if (retry_on_ & RetryPolicy::RETRY_ON_RESET) {
return RetryDecision::RetryWithBackoff;
}
Expand Down
6 changes: 4 additions & 2 deletions source/common/router/retry_state_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ class RetryStateImpl : public RetryState {
const Http::RequestHeaderMap& original_request,
bool& disable_early_data) override;
RetryStatus shouldRetryReset(Http::StreamResetReason reset_reason, Http3Used http3_used,
DoRetryResetCallback callback) override;
DoRetryResetCallback callback,
bool upstream_request_started) override;
RetryStatus shouldHedgeRetryPerTryTimeout(DoRetryCallback callback) override;

void onHostAttempted(Upstream::HostDescriptionConstSharedPtr host) override {
Expand Down Expand Up @@ -112,7 +113,8 @@ class RetryStateImpl : public RetryState {
// disable_http3: populated to tell the caller whether to disable http3 or not when the return
// value indicates retry.
RetryDecision wouldRetryFromReset(const Http::StreamResetReason reset_reason,
Http3Used http3_used, bool& disable_http3);
Http3Used http3_used, bool& disable_http3,
bool upstream_request_started);
RetryStatus shouldRetry(RetryDecision would_retry, DoRetryCallback callback);

const Upstream::ClusterInfo& cluster_;
Expand Down
11 changes: 10 additions & 1 deletion source/common/router/router.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1329,6 +1329,14 @@ bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason,
? RetryState::Http3Used::Yes
: RetryState::Http3Used::No;
}

// If the current request in this router has sent data to the upstream, we consider the request
// started.
upstream_request_started_ |= upstream_request.streamInfo()
.upstreamInfo()
->upstreamTiming()
.first_upstream_tx_byte_sent_.has_value();

const RetryStatus retry_status = retry_state_->shouldRetryReset(
keithmattix marked this conversation as resolved.
Show resolved Hide resolved
reset_reason, was_using_http3,
[this, can_send_early_data = upstream_request.upstreamStreamOptions().can_send_early_data_,
Expand All @@ -1338,7 +1346,8 @@ bool Filter::maybeRetryReset(Http::StreamResetReason reset_reason,
// the original request is retried with the same can_send_early_data setting, it will not be
// sent as early data by the underlying connection pool grid.
doRetry(can_send_early_data, disable_http3 ? false : can_use_http3, is_timeout_retry);
});
},
upstream_request_started_);
if (retry_status == RetryStatus::Yes) {
runRetryOptionsPredicates(upstream_request);
pending_retries_++;
Expand Down
4 changes: 3 additions & 1 deletion source/common/router/router.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,8 @@ class Filter : Logger::Loggable<Logger::Id::router>,
: config_(config), stats_(stats), grpc_request_(false), exclude_http_code_stats_(false),
downstream_response_started_(false), downstream_end_stream_(false), is_retry_(false),
request_buffer_overflowed_(false), streaming_shadows_(Runtime::runtimeFeatureEnabled(
"envoy.reloadable_features.streaming_shadow")) {}
"envoy.reloadable_features.streaming_shadow")),
upstream_request_started_(false) {}

~Filter() override;

Expand Down Expand Up @@ -607,6 +608,7 @@ class Filter : Logger::Loggable<Logger::Id::router>,
bool include_timeout_retry_header_in_request_ : 1;
bool request_buffer_overflowed_ : 1;
const bool streaming_shadows_ : 1;
bool upstream_request_started_ : 1;
};

class ProdFilter : public Filter {
Expand Down
5 changes: 3 additions & 2 deletions test/common/http/async_client_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2315,7 +2315,7 @@ TEST_F(AsyncClientImplUnitTest, AsyncStreamImplInitTestWithRetryPolicy) {
const std::string yaml = R"EOF(
per_try_timeout: 30s
num_retries: 10
retry_on: 5xx,gateway-error,connect-failure,reset
retry_on: 5xx,gateway-error,connect-failure,reset,reset-before-request
retry_back_off:
base_interval: 0.01s
max_interval: 30s
Expand All @@ -2327,7 +2327,8 @@ retry_on: 5xx,gateway-error,connect-failure,reset
EXPECT_EQ(route_entry.retryPolicy().numRetries(), 10);
EXPECT_EQ(route_entry.retryPolicy().perTryTimeout(), std::chrono::seconds(30));
EXPECT_EQ(Router::RetryPolicy::RETRY_ON_CONNECT_FAILURE | Router::RetryPolicy::RETRY_ON_5XX |
Router::RetryPolicy::RETRY_ON_GATEWAY_ERROR | Router::RetryPolicy::RETRY_ON_RESET,
Router::RetryPolicy::RETRY_ON_GATEWAY_ERROR | Router::RetryPolicy::RETRY_ON_RESET |
Router::RetryPolicy::RETRY_ON_RESET_BEFORE_REQUEST,
route_entry.retryPolicy().retryOn());

EXPECT_EQ(route_entry.retryPolicy().baseInterval(), std::chrono::milliseconds(10));
Expand Down
Loading