Avoid rapidly backing off a server if we ignore the retry interval

This commit is contained in:
Richard van der Hoff 2019-06-03 23:50:43 +01:00
parent fec2dcb1a5
commit dce6e9e0c1
2 changed files with 38 additions and 23 deletions

1
changelog.d/5335.bugfix Normal file
View File

@ -0,0 +1 @@
Fix a bug where we could rapidly mark a server as unreachable even though it was only down for a few minutes.

View File

@ -46,8 +46,7 @@ class NotRetryingDestination(Exception):
@defer.inlineCallbacks @defer.inlineCallbacks
def get_retry_limiter(destination, clock, store, ignore_backoff=False, def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs):
**kwargs):
"""For a given destination check if we have previously failed to """For a given destination check if we have previously failed to
send a request there and are waiting before retrying the destination. send a request there and are waiting before retrying the destination.
If we are not ready to retry the destination, this will raise a If we are not ready to retry the destination, this will raise a
@ -60,8 +59,7 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False,
clock (synapse.util.clock): timing source clock (synapse.util.clock): timing source
store (synapse.storage.transactions.TransactionStore): datastore store (synapse.storage.transactions.TransactionStore): datastore
ignore_backoff (bool): true to ignore the historical backoff data and ignore_backoff (bool): true to ignore the historical backoff data and
try the request anyway. We will still update the next try the request anyway. We will still reset the retry_interval on success.
retry_interval on success/failure.
Example usage: Example usage:
@ -75,13 +73,12 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False,
""" """
retry_last_ts, retry_interval = (0, 0) retry_last_ts, retry_interval = (0, 0)
retry_timings = yield store.get_destination_retry_timings( retry_timings = yield store.get_destination_retry_timings(destination)
destination
)
if retry_timings: if retry_timings:
retry_last_ts, retry_interval = ( retry_last_ts, retry_interval = (
retry_timings["retry_last_ts"], retry_timings["retry_interval"] retry_timings["retry_last_ts"],
retry_timings["retry_interval"],
) )
now = int(clock.time_msec()) now = int(clock.time_msec())
@ -93,22 +90,31 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False,
destination=destination, destination=destination,
) )
# if we are ignoring the backoff data, we should also not increment the backoff
# when we get another failure - otherwise a server can very quickly reach the
# maximum backoff even though it might only have been down briefly
backoff_on_failure = not ignore_backoff
defer.returnValue( defer.returnValue(
RetryDestinationLimiter( RetryDestinationLimiter(
destination, destination, clock, store, retry_interval, backoff_on_failure, **kwargs
clock,
store,
retry_interval,
**kwargs
) )
) )
class RetryDestinationLimiter(object): class RetryDestinationLimiter(object):
def __init__(self, destination, clock, store, retry_interval, def __init__(
min_retry_interval=10 * 60 * 1000, self,
max_retry_interval=24 * 60 * 60 * 1000, destination,
multiplier_retry_interval=5, backoff_on_404=False): clock,
store,
retry_interval,
min_retry_interval=10 * 60 * 1000,
max_retry_interval=24 * 60 * 60 * 1000,
multiplier_retry_interval=5,
backoff_on_404=False,
backoff_on_failure=True,
):
"""Marks the destination as "down" if an exception is thrown in the """Marks the destination as "down" if an exception is thrown in the
context, except for CodeMessageException with code < 500. context, except for CodeMessageException with code < 500.
@ -128,6 +134,9 @@ class RetryDestinationLimiter(object):
multiplier_retry_interval (int): The multiplier to use to increase multiplier_retry_interval (int): The multiplier to use to increase
the retry interval after a failed request. the retry interval after a failed request.
backoff_on_404 (bool): Back off if we get a 404 backoff_on_404 (bool): Back off if we get a 404
backoff_on_failure (bool): set to False if we should not increase the
retry interval on a failure.
""" """
self.clock = clock self.clock = clock
self.store = store self.store = store
@ -138,6 +147,7 @@ class RetryDestinationLimiter(object):
self.max_retry_interval = max_retry_interval self.max_retry_interval = max_retry_interval
self.multiplier_retry_interval = multiplier_retry_interval self.multiplier_retry_interval = multiplier_retry_interval
self.backoff_on_404 = backoff_on_404 self.backoff_on_404 = backoff_on_404
self.backoff_on_failure = backoff_on_failure
def __enter__(self): def __enter__(self):
pass pass
@ -173,10 +183,13 @@ class RetryDestinationLimiter(object):
if not self.retry_interval: if not self.retry_interval:
return return
logger.debug("Connection to %s was successful; clearing backoff", logger.debug(
self.destination) "Connection to %s was successful; clearing backoff", self.destination
)
retry_last_ts = 0 retry_last_ts = 0
self.retry_interval = 0 self.retry_interval = 0
elif not self.backoff_on_failure:
return
else: else:
# We couldn't connect. # We couldn't connect.
if self.retry_interval: if self.retry_interval:
@ -190,7 +203,10 @@ class RetryDestinationLimiter(object):
logger.info( logger.info(
"Connection to %s was unsuccessful (%s(%s)); backoff now %i", "Connection to %s was unsuccessful (%s(%s)); backoff now %i",
self.destination, exc_type, exc_val, self.retry_interval self.destination,
exc_type,
exc_val,
self.retry_interval,
) )
retry_last_ts = int(self.clock.time_msec()) retry_last_ts = int(self.clock.time_msec())
@ -201,9 +217,7 @@ class RetryDestinationLimiter(object):
self.destination, retry_last_ts, self.retry_interval self.destination, retry_last_ts, self.retry_interval
) )
except Exception: except Exception:
logger.exception( logger.exception("Failed to store destination_retry_timings")
"Failed to store destination_retry_timings",
)
# we deliberately do this in the background. # we deliberately do this in the background.
synapse.util.logcontext.run_in_background(store_retry_timings) synapse.util.logcontext.run_in_background(store_retry_timings)