2016-01-06 23:26:29 -05:00
|
|
|
# Copyright 2015, 2016 OpenMarket Ltd
|
2015-02-17 12:20:56 -05:00
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
import logging
|
2015-11-02 11:49:05 -05:00
|
|
|
import random
|
2021-09-10 12:03:18 -04:00
|
|
|
from types import TracebackType
|
|
|
|
from typing import Any, Optional, Type
|
2015-02-17 12:20:56 -05:00
|
|
|
|
2019-07-03 10:07:04 -04:00
|
|
|
import synapse.logging.context
|
2018-07-09 02:09:20 -04:00
|
|
|
from synapse.api.errors import CodeMessageException
|
2021-09-10 12:03:18 -04:00
|
|
|
from synapse.storage import DataStore
|
|
|
|
from synapse.util import Clock
|
2015-02-17 12:20:56 -05:00
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
2020-07-09 09:52:58 -04:00
|
|
|
# the initial backoff, after the first transaction fails
|
2019-09-11 10:14:56 -04:00
|
|
|
MIN_RETRY_INTERVAL = 10 * 60 * 1000
|
|
|
|
|
|
|
|
# how much we multiply the backoff by after each subsequent fail
|
|
|
|
RETRY_MULTIPLIER = 5
|
|
|
|
|
2019-09-12 08:00:13 -04:00
|
|
|
# a cap on the backoff. (Essentially none)
|
2019-10-02 05:36:27 -04:00
|
|
|
MAX_RETRY_INTERVAL = 2 ** 62
|
2019-09-11 10:14:56 -04:00
|
|
|
|
2015-02-17 12:20:56 -05:00
|
|
|
|
|
|
|
class NotRetryingDestination(Exception):
|
2021-09-10 12:03:18 -04:00
|
|
|
def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
|
2018-01-22 13:11:18 -05:00
|
|
|
"""Raised by the limiter (and federation client) to indicate that we are
|
|
|
|
are deliberately not attempting to contact a given server.
|
|
|
|
|
|
|
|
Args:
|
2021-09-10 12:03:18 -04:00
|
|
|
retry_last_ts: the unix ts in milliseconds of our last attempt
|
2018-01-22 13:11:18 -05:00
|
|
|
to contact the server. 0 indicates that the last attempt was
|
|
|
|
successful or that we've never actually attempted to connect.
|
2021-09-10 12:03:18 -04:00
|
|
|
retry_interval: the time in milliseconds to wait until the next
|
2018-01-22 13:11:18 -05:00
|
|
|
attempt.
|
2021-09-10 12:03:18 -04:00
|
|
|
destination: the domain in question
|
2018-01-22 13:11:18 -05:00
|
|
|
"""
|
|
|
|
|
2015-02-17 12:20:56 -05:00
|
|
|
msg = "Not retrying server %s." % (destination,)
|
2020-09-18 09:56:44 -04:00
|
|
|
super().__init__(msg)
|
2015-02-17 12:20:56 -05:00
|
|
|
|
|
|
|
self.retry_last_ts = retry_last_ts
|
|
|
|
self.retry_interval = retry_interval
|
|
|
|
self.destination = destination
|
|
|
|
|
|
|
|
|
2021-09-10 12:03:18 -04:00
|
|
|
async def get_retry_limiter(
|
|
|
|
destination: str,
|
|
|
|
clock: Clock,
|
|
|
|
store: DataStore,
|
|
|
|
ignore_backoff: bool = False,
|
|
|
|
**kwargs: Any,
|
|
|
|
) -> "RetryDestinationLimiter":
|
2015-02-18 05:09:54 -05:00
|
|
|
"""For a given destination check if we have previously failed to
|
|
|
|
send a request there and are waiting before retrying the destination.
|
|
|
|
If we are not ready to retry the destination, this will raise a
|
|
|
|
NotRetryingDestination exception. Otherwise, will return a Context Manager
|
|
|
|
that will mark the destination as down if an exception is thrown (excluding
|
|
|
|
CodeMessageException with code < 500)
|
|
|
|
|
2017-03-23 07:10:36 -04:00
|
|
|
Args:
|
2021-09-10 12:03:18 -04:00
|
|
|
destination: name of homeserver
|
|
|
|
clock: timing source
|
|
|
|
store: datastore
|
|
|
|
ignore_backoff: true to ignore the historical backoff data and
|
2019-06-03 18:50:43 -04:00
|
|
|
try the request anyway. We will still reset the retry_interval on success.
|
2017-03-23 07:10:36 -04:00
|
|
|
|
2015-02-18 05:09:54 -05:00
|
|
|
Example usage:
|
|
|
|
|
|
|
|
try:
|
2020-08-06 08:39:35 -04:00
|
|
|
limiter = await get_retry_limiter(destination, clock, store)
|
2015-02-18 05:09:54 -05:00
|
|
|
with limiter:
|
2020-08-06 08:39:35 -04:00
|
|
|
response = await do_request()
|
2015-02-18 05:09:54 -05:00
|
|
|
except NotRetryingDestination:
|
|
|
|
# We aren't ready to retry that destination.
|
|
|
|
raise
|
|
|
|
"""
|
2019-09-17 06:41:54 -04:00
|
|
|
failure_ts = None
|
2015-02-17 12:20:56 -05:00
|
|
|
retry_last_ts, retry_interval = (0, 0)
|
|
|
|
|
2020-08-06 08:39:35 -04:00
|
|
|
retry_timings = await store.get_destination_retry_timings(destination)
|
2015-02-17 12:20:56 -05:00
|
|
|
|
|
|
|
if retry_timings:
|
2021-05-21 12:57:08 -04:00
|
|
|
failure_ts = retry_timings.failure_ts
|
|
|
|
retry_last_ts = retry_timings.retry_last_ts
|
|
|
|
retry_interval = retry_timings.retry_interval
|
2015-02-17 12:20:56 -05:00
|
|
|
|
|
|
|
now = int(clock.time_msec())
|
|
|
|
|
2017-03-23 07:10:36 -04:00
|
|
|
if not ignore_backoff and retry_last_ts + retry_interval > now:
|
2015-02-17 12:20:56 -05:00
|
|
|
raise NotRetryingDestination(
|
|
|
|
retry_last_ts=retry_last_ts,
|
|
|
|
retry_interval=retry_interval,
|
|
|
|
destination=destination,
|
|
|
|
)
|
|
|
|
|
2019-06-03 18:50:43 -04:00
|
|
|
# if we are ignoring the backoff data, we should also not increment the backoff
|
|
|
|
# when we get another failure - otherwise a server can very quickly reach the
|
|
|
|
# maximum backoff even though it might only have been down briefly
|
|
|
|
backoff_on_failure = not ignore_backoff
|
|
|
|
|
2019-07-23 09:00:55 -04:00
|
|
|
return RetryDestinationLimiter(
|
|
|
|
destination,
|
|
|
|
clock,
|
|
|
|
store,
|
2019-09-17 06:41:54 -04:00
|
|
|
failure_ts,
|
2019-07-23 09:00:55 -04:00
|
|
|
retry_interval,
|
|
|
|
backoff_on_failure=backoff_on_failure,
|
2020-10-27 19:26:36 -04:00
|
|
|
**kwargs,
|
2015-02-17 12:20:56 -05:00
|
|
|
)
|
|
|
|
|
|
|
|
|
2020-09-04 06:54:56 -04:00
|
|
|
class RetryDestinationLimiter:
|
2019-06-03 18:50:43 -04:00
|
|
|
def __init__(
|
|
|
|
self,
|
2021-09-10 12:03:18 -04:00
|
|
|
destination: str,
|
|
|
|
clock: Clock,
|
|
|
|
store: DataStore,
|
|
|
|
failure_ts: Optional[int],
|
|
|
|
retry_interval: int,
|
|
|
|
backoff_on_404: bool = False,
|
|
|
|
backoff_on_failure: bool = True,
|
2019-06-03 18:50:43 -04:00
|
|
|
):
|
2015-02-18 05:11:24 -05:00
|
|
|
"""Marks the destination as "down" if an exception is thrown in the
|
|
|
|
context, except for CodeMessageException with code < 500.
|
|
|
|
|
|
|
|
If no exception is raised, marks the destination as "up".
|
|
|
|
|
2015-02-18 05:09:54 -05:00
|
|
|
Args:
|
2021-09-10 12:03:18 -04:00
|
|
|
destination
|
|
|
|
clock
|
|
|
|
store
|
|
|
|
failure_ts: when this destination started failing (in ms since
|
2019-09-17 06:41:54 -04:00
|
|
|
the epoch), or zero if the last request was successful
|
2021-09-10 12:03:18 -04:00
|
|
|
retry_interval: The next retry interval taken from the
|
2015-02-18 05:09:54 -05:00
|
|
|
database in milliseconds, or zero if the last request was
|
|
|
|
successful.
|
2021-09-10 12:03:18 -04:00
|
|
|
backoff_on_404: Back off if we get a 404
|
2019-06-03 18:50:43 -04:00
|
|
|
|
2021-09-10 12:03:18 -04:00
|
|
|
backoff_on_failure: set to False if we should not increase the
|
2019-06-03 18:50:43 -04:00
|
|
|
retry interval on a failure.
|
2015-02-18 05:09:54 -05:00
|
|
|
"""
|
2015-02-17 12:20:56 -05:00
|
|
|
self.clock = clock
|
|
|
|
self.store = store
|
|
|
|
self.destination = destination
|
|
|
|
|
2019-09-17 06:41:54 -04:00
|
|
|
self.failure_ts = failure_ts
|
2015-02-17 12:20:56 -05:00
|
|
|
self.retry_interval = retry_interval
|
2017-01-31 08:40:09 -05:00
|
|
|
self.backoff_on_404 = backoff_on_404
|
2019-06-03 18:50:43 -04:00
|
|
|
self.backoff_on_failure = backoff_on_failure
|
2015-02-17 12:20:56 -05:00
|
|
|
|
2021-09-10 12:03:18 -04:00
|
|
|
def __enter__(self) -> None:
|
2015-02-17 12:20:56 -05:00
|
|
|
pass
|
|
|
|
|
2021-09-10 12:03:18 -04:00
|
|
|
def __exit__(
|
|
|
|
self,
|
|
|
|
exc_type: Optional[Type[BaseException]],
|
|
|
|
exc_val: Optional[BaseException],
|
|
|
|
exc_tb: Optional[TracebackType],
|
|
|
|
) -> None:
|
2015-02-17 13:13:34 -05:00
|
|
|
valid_err_code = False
|
2017-03-22 20:12:21 -04:00
|
|
|
if exc_type is None:
|
|
|
|
valid_err_code = True
|
|
|
|
elif not issubclass(exc_type, Exception):
|
|
|
|
# avoid treating exceptions which don't derive from Exception as
|
|
|
|
# failures; this is mostly so as not to catch defer._DefGen.
|
|
|
|
valid_err_code = True
|
2021-09-10 12:03:18 -04:00
|
|
|
elif isinstance(exc_val, CodeMessageException):
|
2017-01-31 08:46:38 -05:00
|
|
|
# Some error codes are perfectly fine for some APIs, whereas other
|
|
|
|
# APIs may expect to never received e.g. a 404. It's important to
|
|
|
|
# handle 404 as some remote servers will return a 404 when the HS
|
|
|
|
# has been decommissioned.
|
2017-02-01 10:42:19 -05:00
|
|
|
# If we get a 401, then we should probably back off since they
|
|
|
|
# won't accept our requests for at least a while.
|
2020-07-09 09:52:58 -04:00
|
|
|
# 429 is us being aggressively rate limited, so lets rate limit
|
2017-02-01 10:42:19 -05:00
|
|
|
# ourselves.
|
2017-01-31 10:21:32 -05:00
|
|
|
if exc_val.code == 404 and self.backoff_on_404:
|
2017-01-31 08:40:09 -05:00
|
|
|
valid_err_code = False
|
2017-02-01 10:42:19 -05:00
|
|
|
elif exc_val.code in (401, 429):
|
2017-01-31 08:40:09 -05:00
|
|
|
valid_err_code = False
|
|
|
|
elif exc_val.code < 500:
|
|
|
|
valid_err_code = True
|
|
|
|
else:
|
|
|
|
valid_err_code = False
|
2015-02-17 13:13:34 -05:00
|
|
|
|
2017-03-22 20:12:21 -04:00
|
|
|
if valid_err_code:
|
2015-02-17 12:20:56 -05:00
|
|
|
# We connected successfully.
|
2015-02-17 12:38:38 -05:00
|
|
|
if not self.retry_interval:
|
|
|
|
return
|
|
|
|
|
2019-06-03 18:50:43 -04:00
|
|
|
logger.debug(
|
|
|
|
"Connection to %s was successful; clearing backoff", self.destination
|
|
|
|
)
|
2019-09-17 06:41:54 -04:00
|
|
|
self.failure_ts = None
|
2015-02-17 12:20:56 -05:00
|
|
|
retry_last_ts = 0
|
|
|
|
self.retry_interval = 0
|
2019-06-03 18:50:43 -04:00
|
|
|
elif not self.backoff_on_failure:
|
|
|
|
return
|
2015-02-17 12:20:56 -05:00
|
|
|
else:
|
|
|
|
# We couldn't connect.
|
|
|
|
if self.retry_interval:
|
2019-09-12 07:59:43 -04:00
|
|
|
self.retry_interval = int(
|
|
|
|
self.retry_interval * RETRY_MULTIPLIER * random.uniform(0.8, 1.4)
|
|
|
|
)
|
2015-02-17 12:20:56 -05:00
|
|
|
|
2019-09-11 10:14:56 -04:00
|
|
|
if self.retry_interval >= MAX_RETRY_INTERVAL:
|
|
|
|
self.retry_interval = MAX_RETRY_INTERVAL
|
2015-02-17 12:20:56 -05:00
|
|
|
else:
|
2019-09-11 10:14:56 -04:00
|
|
|
self.retry_interval = MIN_RETRY_INTERVAL
|
2015-02-17 12:20:56 -05:00
|
|
|
|
2018-09-19 12:28:18 -04:00
|
|
|
logger.info(
|
2017-03-22 20:12:21 -04:00
|
|
|
"Connection to %s was unsuccessful (%s(%s)); backoff now %i",
|
2019-06-03 18:50:43 -04:00
|
|
|
self.destination,
|
|
|
|
exc_type,
|
|
|
|
exc_val,
|
|
|
|
self.retry_interval,
|
2017-03-22 20:12:21 -04:00
|
|
|
)
|
2015-02-18 06:25:20 -05:00
|
|
|
retry_last_ts = int(self.clock.time_msec())
|
2015-02-17 12:20:56 -05:00
|
|
|
|
2019-09-17 06:41:54 -04:00
|
|
|
if self.failure_ts is None:
|
|
|
|
self.failure_ts = retry_last_ts
|
|
|
|
|
2021-09-10 12:03:18 -04:00
|
|
|
async def store_retry_timings() -> None:
|
2016-11-22 12:45:44 -05:00
|
|
|
try:
|
2020-08-06 08:39:35 -04:00
|
|
|
await self.store.set_destination_retry_timings(
|
2019-09-17 06:41:54 -04:00
|
|
|
self.destination,
|
|
|
|
self.failure_ts,
|
|
|
|
retry_last_ts,
|
|
|
|
self.retry_interval,
|
2016-11-22 12:45:44 -05:00
|
|
|
)
|
2017-10-23 10:52:32 -04:00
|
|
|
except Exception:
|
2019-06-03 18:50:43 -04:00
|
|
|
logger.exception("Failed to store destination_retry_timings")
|
2016-11-22 12:45:44 -05:00
|
|
|
|
2017-03-22 20:17:46 -04:00
|
|
|
# we deliberately do this in the background.
|
2019-07-03 10:07:04 -04:00
|
|
|
synapse.logging.context.run_in_background(store_retry_timings)
|