# Copyright 2015, 2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import random
from types import TracebackType
from typing import TYPE_CHECKING, Any, Optional, Type

from synapse.api.errors import CodeMessageException
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.storage import DataStore
from synapse.types import StrCollection
from synapse.util import Clock

if TYPE_CHECKING:
    from synapse.notifier import Notifier
    from synapse.replication.tcp.handler import ReplicationCommandHandler

logger = logging.getLogger(__name__)


class NotRetryingDestination(Exception):
    def __init__(self, retry_last_ts: int, retry_interval: int, destination: str):
""" Raised by the limiter (and federation client) to indicate that we are
are deliberately not attempting to contact a given server .
Args :
2021-09-10 12:03:18 -04:00
retry_last_ts : the unix ts in milliseconds of our last attempt
2018-01-22 13:11:18 -05:00
to contact the server . 0 indicates that the last attempt was
successful or that we ' ve never actually attempted to connect.
2021-09-10 12:03:18 -04:00
retry_interval : the time in milliseconds to wait until the next
2018-01-22 13:11:18 -05:00
attempt .
2021-09-10 12:03:18 -04:00
destination : the domain in question
2018-01-22 13:11:18 -05:00
"""
2022-10-26 17:10:55 -04:00
msg = f " Not retrying server { destination } because we tried it recently retry_last_ts= { retry_last_ts } and we won ' t check for another retry_interval= { retry_interval } ms. "
        super().__init__(msg)

        self.retry_last_ts = retry_last_ts
        self.retry_interval = retry_interval
        self.destination = destination
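

# Illustrative sketch (not part of the original module): callers generally do
# not raise NotRetryingDestination themselves; they catch it around federation
# requests and inspect its attributes, along the lines of:
#
#     try:
#         limiter = await get_retry_limiter(destination, clock, store)
#     except NotRetryingDestination as e:
#         logger.debug(
#             "Not contacting %s for another %dms", e.destination, e.retry_interval
#         )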
2021-09-10 12:03:18 -04:00
async def get_retry_limiter (
destination : str ,
clock : Clock ,
store : DataStore ,
ignore_backoff : bool = False ,
* * kwargs : Any ,
) - > " RetryDestinationLimiter " :
2015-02-18 05:09:54 -05:00
""" For a given destination check if we have previously failed to
send a request there and are waiting before retrying the destination .
If we are not ready to retry the destination , this will raise a
NotRetryingDestination exception . Otherwise , will return a Context Manager
that will mark the destination as down if an exception is thrown ( excluding
CodeMessageException with code < 500 )
2017-03-23 07:10:36 -04:00
Args :
2021-09-10 12:03:18 -04:00
destination : name of homeserver
clock : timing source
store : datastore
ignore_backoff : true to ignore the historical backoff data and
2019-06-03 18:50:43 -04:00
try the request anyway . We will still reset the retry_interval on success .
2017-03-23 07:10:36 -04:00
2015-02-18 05:09:54 -05:00
Example usage :
try :
2020-08-06 08:39:35 -04:00
limiter = await get_retry_limiter ( destination , clock , store )
2015-02-18 05:09:54 -05:00
with limiter :
2020-08-06 08:39:35 -04:00
response = await do_request ( )
2015-02-18 05:09:54 -05:00
except NotRetryingDestination :
# We aren't ready to retry that destination.
raise
"""
    failure_ts = None
    retry_last_ts, retry_interval = (0, 0)

    retry_timings = await store.get_destination_retry_timings(destination)

    if retry_timings:
        failure_ts = retry_timings.failure_ts
        retry_last_ts = retry_timings.retry_last_ts
        retry_interval = retry_timings.retry_interval

        now = int(clock.time_msec())

        if not ignore_backoff and retry_last_ts + retry_interval > now:
            raise NotRetryingDestination(
                retry_last_ts=retry_last_ts,
                retry_interval=retry_interval,
                destination=destination,
            )

    # if we are ignoring the backoff data, we should also not increment the backoff
    # when we get another failure - otherwise a server can very quickly reach the
    # maximum backoff even though it might only have been down briefly
    backoff_on_failure = not ignore_backoff

    return RetryDestinationLimiter(
        destination,
        clock,
        store,
        failure_ts,
        retry_interval,
        backoff_on_failure=backoff_on_failure,
        **kwargs,
    )


async def filter_destinations_by_retry_limiter(
    destinations: StrCollection,
    clock: Clock,
    store: DataStore,
    retry_due_within_ms: int = 0,
) -> StrCollection:
    """Filter down the list of destinations to only those that are either
    alive or due for a retry (within `retry_due_within_ms`).
    """
    if not destinations:
        return destinations

    retry_timings = await store.get_destination_retry_timings_batch(destinations)

    now = int(clock.time_msec())
    return [
        destination
        for destination, timings in retry_timings.items()
        if timings is None
        or timings.retry_last_ts + timings.retry_interval <= now + retry_due_within_ms
    ]
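

# Illustrative only (not an actual call site): a sketch of how this helper
# might be used, with hypothetical variable names:
#
#     live_destinations = await filter_destinations_by_retry_limiter(
#         candidate_servers, clock, store, retry_due_within_ms=60_000
#     )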


class RetryDestinationLimiter:
    def __init__(
        self,
        destination: str,
        clock: Clock,
        store: DataStore,
        failure_ts: Optional[int],
        retry_interval: int,
        backoff_on_404: bool = False,
        backoff_on_failure: bool = True,
        notifier: Optional["Notifier"] = None,
        replication_client: Optional["ReplicationCommandHandler"] = None,
        backoff_on_all_error_codes: bool = False,
    ):
        """Marks the destination as "down" if an exception is thrown in the
        context, except for CodeMessageException with code < 500.

        If no exception is raised, marks the destination as "up".

        Args:
            destination
            clock
            store
            failure_ts: when this destination started failing (in ms since
                the epoch), or zero if the last request was successful
            retry_interval: The next retry interval taken from the
                database in milliseconds, or zero if the last request was
                successful.
            backoff_on_404: Back off if we get a 404
            backoff_on_failure: set to False if we should not increase the
                retry interval on a failure.
            notifier: A notifier used to mark servers as up.
            replication_client: A replication client used to mark servers as up.
            backoff_on_all_error_codes: Whether we should back off on any
                error code.
        """
        self.clock = clock
        self.store = store
        self.destination = destination

        self.failure_ts = failure_ts
        self.retry_interval = retry_interval
        self.backoff_on_404 = backoff_on_404
        self.backoff_on_failure = backoff_on_failure
        self.backoff_on_all_error_codes = backoff_on_all_error_codes

        self.notifier = notifier
        self.replication_client = replication_client

        self.destination_min_retry_interval_ms = (
            self.store.hs.config.federation.destination_min_retry_interval_ms
        )
        self.destination_retry_multiplier = (
            self.store.hs.config.federation.destination_retry_multiplier
        )
        self.destination_max_retry_interval_ms = (
            self.store.hs.config.federation.destination_max_retry_interval_ms
        )
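
        # For orientation (hedged, not authoritative): these values come from
        # the homeserver's federation config. The YAML option names are assumed
        # here to mirror the attribute names read above, e.g. something like:
        #
        #     federation:
        #       destination_min_retry_interval: 10m
        #       destination_retry_multiplier: 2
        #       destination_max_retry_interval: 7d
        #
        # Check the homeserver configuration documentation for the exact names
        # and defaults.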

    def __enter__(self) -> None:
        pass

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        success = exc_type is None
        valid_err_code = False
        if exc_type is None:
            valid_err_code = True
        elif not issubclass(exc_type, Exception):
            # avoid treating exceptions which don't derive from Exception as
            # failures; this is mostly so as not to catch defer._DefGen.
            valid_err_code = True
        elif isinstance(exc_val, CodeMessageException):
            # Some error codes are perfectly fine for some APIs, whereas other
            # APIs may expect to never receive e.g. a 404. It's important to
            # handle 404 as some remote servers will return a 404 when the HS
            # has been decommissioned.
            # If we get a 401, then we should probably back off since they
            # won't accept our requests for at least a while.
            # 429 is us being aggressively rate limited, so let's rate limit
            # ourselves.
            if self.backoff_on_all_error_codes:
                valid_err_code = False
            elif exc_val.code == 404 and self.backoff_on_404:
                valid_err_code = False
            elif exc_val.code in (401, 429):
                valid_err_code = False
            elif exc_val.code < 500:
                valid_err_code = True
            else:
                valid_err_code = False

        # Whether previous requests to the destination had been failing.
        previously_failing = bool(self.failure_ts)
2023-09-04 09:04:43 -04:00
if success :
2015-02-17 12:20:56 -05:00
# We connected successfully.
2015-02-17 12:38:38 -05:00
if not self . retry_interval :
return
2019-06-03 18:50:43 -04:00
logger . debug (
" Connection to %s was successful; clearing backoff " , self . destination
)
2019-09-17 06:41:54 -04:00
self . failure_ts = None
2015-02-17 12:20:56 -05:00
retry_last_ts = 0
self . retry_interval = 0
2023-09-04 09:04:43 -04:00
elif valid_err_code :
# We got a potentially valid error code back. We don't reset the
# timers though, as the other side might actually be down anyway
# (e.g. some deprovisioned servers will always return a 404 or 403,
# and we don't want to keep resetting the retry timers for them).
return
2019-06-03 18:50:43 -04:00
elif not self . backoff_on_failure :
return
2015-02-17 12:20:56 -05:00
else :
# We couldn't connect.
            if self.retry_interval:
                self.retry_interval = int(
                    self.retry_interval
                    * self.destination_retry_multiplier
                    * random.uniform(0.8, 1.4)
                )

                if self.retry_interval >= self.destination_max_retry_interval_ms:
                    self.retry_interval = self.destination_max_retry_interval_ms
            else:
                self.retry_interval = self.destination_min_retry_interval_ms
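
            # Worked example (illustrative only; actual values depend on the
            # homeserver's configuration): if the minimum interval were 10
            # minutes and the multiplier 2, successive failures would back off
            # roughly 10m -> ~20m -> ~40m -> ~80m ..., each step jittered by a
            # factor of 0.8-1.4 via random.uniform above, until capped at
            # destination_max_retry_interval_ms.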

            logger.info(
                "Connection to %s was unsuccessful (%s(%s)); backoff now %i",
                self.destination,
                exc_type,
                exc_val,
                self.retry_interval,
            )
            retry_last_ts = int(self.clock.time_msec())

            if self.failure_ts is None:
                self.failure_ts = retry_last_ts

        # Whether the destination is still failing after the current request.
        currently_failing = bool(self.failure_ts)

        async def store_retry_timings() -> None:
            try:
                await self.store.set_destination_retry_timings(
                    self.destination,
                    self.failure_ts,
                    retry_last_ts,
                    self.retry_interval,
                )

                # If the server was previously failing, but is no longer.
                if previously_failing and not currently_failing:
                    if self.notifier:
                        # Inform the relevant places that the remote server is back up.
                        self.notifier.notify_remote_server_up(self.destination)

                    if self.replication_client:
                        # Inform other workers that the remote server is up.
                        self.replication_client.send_remote_server_up(self.destination)

            except Exception:
                logger.exception("Failed to store destination_retry_timings")

        # we deliberately do this in the background.
        run_as_background_process("store_retry_timings", store_retry_timings)