Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Don't reset retry timers on "valid" error codes (#16221)
Browse files Browse the repository at this point in the history
  • Loading branch information
erikjohnston authored Sep 4, 2023
1 parent 748c389 commit f84baec
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 5 deletions.
1 change: 1 addition & 0 deletions changelog.d/16221.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix long-standing bug where we did not correctly back off from servers that had "gone" if they returned 4xx series error codes.
4 changes: 3 additions & 1 deletion synapse/federation/transport/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,10 @@ async def send_transaction(
data=json_data,
json_data_callback=json_data_callback,
long_retries=True,
backoff_on_404=True, # If we get a 404 the other side has gone
try_trailing_slash_on_400=True,
# Sending a transaction should always succeed, if it doesn't
# then something is wrong and we should backoff.
backoff_on_all_error_codes=True,
)

async def make_query(
Expand Down
8 changes: 8 additions & 0 deletions synapse/http/matrixfederationclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,7 @@ async def _send_request(
long_retries: bool = False,
ignore_backoff: bool = False,
backoff_on_404: bool = False,
backoff_on_all_error_codes: bool = False,
) -> IResponse:
"""
Sends a request to the given server.
Expand Down Expand Up @@ -552,6 +553,7 @@ async def _send_request(
and try the request anyway.
backoff_on_404: Back off if we get a 404
backoff_on_all_error_codes: Back off if we get any error response
Returns:
Resolves with the HTTP response object on success.
Expand Down Expand Up @@ -594,6 +596,7 @@ async def _send_request(
ignore_backoff=ignore_backoff,
notifier=self.hs.get_notifier(),
replication_client=self.hs.get_replication_command_handler(),
backoff_on_all_error_codes=backoff_on_all_error_codes,
)

method_bytes = request.method.encode("ascii")
Expand Down Expand Up @@ -889,6 +892,7 @@ async def put_json(
backoff_on_404: bool = False,
try_trailing_slash_on_400: bool = False,
parser: Literal[None] = None,
backoff_on_all_error_codes: bool = False,
) -> JsonDict:
...

Expand All @@ -906,6 +910,7 @@ async def put_json(
backoff_on_404: bool = False,
try_trailing_slash_on_400: bool = False,
parser: Optional[ByteParser[T]] = None,
backoff_on_all_error_codes: bool = False,
) -> T:
...

Expand All @@ -922,6 +927,7 @@ async def put_json(
backoff_on_404: bool = False,
try_trailing_slash_on_400: bool = False,
parser: Optional[ByteParser[T]] = None,
backoff_on_all_error_codes: bool = False,
) -> Union[JsonDict, T]:
"""Sends the specified json data using PUT
Expand Down Expand Up @@ -957,6 +963,7 @@ async def put_json(
enabled.
parser: The parser to use to decode the response. Defaults to
parsing as JSON.
backoff_on_all_error_codes: Back off if we get any error response
Returns:
Succeeds when we get a 2xx HTTP response. The
Expand Down Expand Up @@ -990,6 +997,7 @@ async def put_json(
ignore_backoff=ignore_backoff,
long_retries=long_retries,
timeout=timeout,
backoff_on_all_error_codes=backoff_on_all_error_codes,
)

if timeout is not None:
Expand Down
18 changes: 16 additions & 2 deletions synapse/util/retryutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def __init__(
backoff_on_failure: bool = True,
notifier: Optional["Notifier"] = None,
replication_client: Optional["ReplicationCommandHandler"] = None,
backoff_on_all_error_codes: bool = False,
):
"""Marks the destination as "down" if an exception is thrown in the
context, except for CodeMessageException with code < 500.
Expand All @@ -147,6 +148,9 @@ def __init__(
backoff_on_failure: set to False if we should not increase the
retry interval on a failure.
backoff_on_all_error_codes: Whether we should back off on any
error code.
"""
self.clock = clock
self.store = store
Expand All @@ -156,6 +160,7 @@ def __init__(
self.retry_interval = retry_interval
self.backoff_on_404 = backoff_on_404
self.backoff_on_failure = backoff_on_failure
self.backoff_on_all_error_codes = backoff_on_all_error_codes

self.notifier = notifier
self.replication_client = replication_client
Expand All @@ -179,6 +184,7 @@ def __exit__(
exc_val: Optional[BaseException],
exc_tb: Optional[TracebackType],
) -> None:
success = exc_type is None
valid_err_code = False
if exc_type is None:
valid_err_code = True
Expand All @@ -195,7 +201,9 @@ def __exit__(
# won't accept our requests for at least a while.
# 429 is us being aggressively rate limited, so lets rate limit
# ourselves.
if exc_val.code == 404 and self.backoff_on_404:
if self.backoff_on_all_error_codes:
valid_err_code = False
elif exc_val.code == 404 and self.backoff_on_404:
valid_err_code = False
elif exc_val.code in (401, 429):
valid_err_code = False
Expand All @@ -204,7 +212,7 @@ def __exit__(
else:
valid_err_code = False

if valid_err_code:
if success:
# We connected successfully.
if not self.retry_interval:
return
Expand All @@ -215,6 +223,12 @@ def __exit__(
self.failure_ts = None
retry_last_ts = 0
self.retry_interval = 0
elif valid_err_code:
# We got a potentially valid error code back. We don't reset the
# timers though, as the other side might actually be down anyway
# (e.g. some deprovisioned servers will always return a 404 or 403,
# and we don't want to keep resetting the retry timers for them).
return
elif not self.backoff_on_failure:
return
else:
Expand Down
4 changes: 2 additions & 2 deletions tests/handlers/test_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,8 @@ def test_started_typing_remote_send(self) -> None:
),
json_data_callback=ANY,
long_retries=True,
backoff_on_404=True,
try_trailing_slash_on_400=True,
backoff_on_all_error_codes=True,
)

def test_started_typing_remote_recv(self) -> None:
Expand Down Expand Up @@ -366,7 +366,7 @@ def test_stopped_typing(self) -> None:
),
json_data_callback=ANY,
long_retries=True,
backoff_on_404=True,
backoff_on_all_error_codes=True,
try_trailing_slash_on_400=True,
)

Expand Down

0 comments on commit f84baec

Please sign in to comment.