Don't reset retry timers on "valid" error codes (#16221)

This commit is contained in:
Erik Johnston 2023-09-04 14:04:43 +01:00 committed by GitHub
parent 748c38921c
commit f84baecb6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 30 additions and 5 deletions

1
changelog.d/16221.bugfix Normal file
View File

@ -0,0 +1 @@
Fix long-standing bug where we did not correctly back off from servers that had "gone" if they returned 4xx series error codes.

View File

@ -249,8 +249,10 @@ class TransportLayerClient:
data=json_data,
json_data_callback=json_data_callback,
long_retries=True,
backoff_on_404=True, # If we get a 404 the other side has gone
try_trailing_slash_on_400=True,
# Sending a transaction should always succeed, if it doesn't
# then something is wrong and we should backoff.
backoff_on_all_error_codes=True,
)
async def make_query(

View File

@ -512,6 +512,7 @@ class MatrixFederationHttpClient:
long_retries: bool = False,
ignore_backoff: bool = False,
backoff_on_404: bool = False,
backoff_on_all_error_codes: bool = False,
) -> IResponse:
"""
Sends a request to the given server.
@ -552,6 +553,7 @@ class MatrixFederationHttpClient:
and try the request anyway.
backoff_on_404: Back off if we get a 404
backoff_on_all_error_codes: Back off if we get any error response
Returns:
Resolves with the HTTP response object on success.
@ -594,6 +596,7 @@ class MatrixFederationHttpClient:
ignore_backoff=ignore_backoff,
notifier=self.hs.get_notifier(),
replication_client=self.hs.get_replication_command_handler(),
backoff_on_all_error_codes=backoff_on_all_error_codes,
)
method_bytes = request.method.encode("ascii")
@ -889,6 +892,7 @@ class MatrixFederationHttpClient:
backoff_on_404: bool = False,
try_trailing_slash_on_400: bool = False,
parser: Literal[None] = None,
backoff_on_all_error_codes: bool = False,
) -> JsonDict:
...
@ -906,6 +910,7 @@ class MatrixFederationHttpClient:
backoff_on_404: bool = False,
try_trailing_slash_on_400: bool = False,
parser: Optional[ByteParser[T]] = None,
backoff_on_all_error_codes: bool = False,
) -> T:
...
@ -922,6 +927,7 @@ class MatrixFederationHttpClient:
backoff_on_404: bool = False,
try_trailing_slash_on_400: bool = False,
parser: Optional[ByteParser[T]] = None,
backoff_on_all_error_codes: bool = False,
) -> Union[JsonDict, T]:
"""Sends the specified json data using PUT
@ -957,6 +963,7 @@ class MatrixFederationHttpClient:
enabled.
parser: The parser to use to decode the response. Defaults to
parsing as JSON.
backoff_on_all_error_codes: Back off if we get any error response
Returns:
Succeeds when we get a 2xx HTTP response. The
@ -990,6 +997,7 @@ class MatrixFederationHttpClient:
ignore_backoff=ignore_backoff,
long_retries=long_retries,
timeout=timeout,
backoff_on_all_error_codes=backoff_on_all_error_codes,
)
if timeout is not None:

View File

@ -128,6 +128,7 @@ class RetryDestinationLimiter:
backoff_on_failure: bool = True,
notifier: Optional["Notifier"] = None,
replication_client: Optional["ReplicationCommandHandler"] = None,
backoff_on_all_error_codes: bool = False,
):
"""Marks the destination as "down" if an exception is thrown in the
context, except for CodeMessageException with code < 500.
@ -147,6 +148,9 @@ class RetryDestinationLimiter:
backoff_on_failure: set to False if we should not increase the
retry interval on a failure.
backoff_on_all_error_codes: Whether we should back off on any
error code.
"""
self.clock = clock
self.store = store
@ -156,6 +160,7 @@ class RetryDestinationLimiter:
self.retry_interval = retry_interval
self.backoff_on_404 = backoff_on_404
self.backoff_on_failure = backoff_on_failure
self.backoff_on_all_error_codes = backoff_on_all_error_codes
self.notifier = notifier
self.replication_client = replication_client
@ -179,6 +184,7 @@ class RetryDestinationLimiter:
exc_val: Optional[BaseException],
exc_tb: Optional[TracebackType],
) -> None:
success = exc_type is None
valid_err_code = False
if exc_type is None:
valid_err_code = True
@ -195,7 +201,9 @@ class RetryDestinationLimiter:
# won't accept our requests for at least a while.
# 429 is us being aggressively rate limited, so lets rate limit
# ourselves.
if exc_val.code == 404 and self.backoff_on_404:
if self.backoff_on_all_error_codes:
valid_err_code = False
elif exc_val.code == 404 and self.backoff_on_404:
valid_err_code = False
elif exc_val.code in (401, 429):
valid_err_code = False
@ -204,7 +212,7 @@ class RetryDestinationLimiter:
else:
valid_err_code = False
if valid_err_code:
if success:
# We connected successfully.
if not self.retry_interval:
return
@ -215,6 +223,12 @@ class RetryDestinationLimiter:
self.failure_ts = None
retry_last_ts = 0
self.retry_interval = 0
elif valid_err_code:
# We got a potentially valid error code back. We don't reset the
# timers though, as the other side might actually be down anyway
# (e.g. some deprovisioned servers will always return a 404 or 403,
# and we don't want to keep resetting the retry timers for them).
return
elif not self.backoff_on_failure:
return
else:

View File

@ -251,8 +251,8 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
),
json_data_callback=ANY,
long_retries=True,
backoff_on_404=True,
try_trailing_slash_on_400=True,
backoff_on_all_error_codes=True,
)
def test_started_typing_remote_recv(self) -> None:
@ -366,7 +366,7 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
),
json_data_callback=ANY,
long_retries=True,
backoff_on_404=True,
backoff_on_all_error_codes=True,
try_trailing_slash_on_400=True,
)