Fix spinloop during partial state sync when a prev event is in backoff (#15351)

Previously, we would spin in a tight loop until
`update_state_for_partial_state_event` stopped raising
`FederationPullAttemptBackoffError`s. Replace the spinloop with a wait
until the backoff period has expired.

Signed-off-by: Sean Quah <seanq@matrix.org>
Authored by Sean Quah on 2023-03-30 13:36:41 +01:00 (committed by GitHub)
parent a3bad89d57
commit d9f694932c
6 changed files with 80 additions and 48 deletions
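In essence, the fix turns a busy retry into a timed wait: `FederationPullAttemptBackoffError` now reports how much of the backoff period is left, and the partial-state sync sleeps for that long before retrying instead of retrying immediately. A simplified sketch of the new behaviour, not the exact Synapse code (`pull` stands in for `update_state_for_partial_state_event`); the real diffs follow below:

    import asyncio
    import logging

    from synapse.api.errors import FederationPullAttemptBackoffError

    logger = logging.getLogger(__name__)

    async def update_until_backoff_clears(pull, destination, event) -> None:
        # Keep retrying the same destination, but sleep out the remaining
        # backoff interval each time instead of spinning in a tight loop.
        while True:
            try:
                await pull(destination, event)
                return
            except FederationPullAttemptBackoffError as e:
                logger.warning("%s; waiting for %d ms...", e, e.retry_after_ms)
                await asyncio.sleep(e.retry_after_ms / 1000)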

changelog.d/15351.bugfix (new file)

@@ -0,0 +1 @@
+Fix a bug introduced in Synapse 1.70.0 where the background sync from a faster join could spin for hours when one of the events involved had been marked for backoff.


@@ -27,7 +27,7 @@ from synapse.util import json_decoder

 if typing.TYPE_CHECKING:
     from synapse.config.homeserver import HomeServerConfig
-    from synapse.types import JsonDict
+    from synapse.types import JsonDict, StrCollection


 logger = logging.getLogger(__name__)
@@ -682,18 +682,27 @@ class FederationPullAttemptBackoffError(RuntimeError):
     Attributes:
         event_id: The event_id which we are refusing to pull
         message: A custom error message that gives more context
+        retry_after_ms: The remaining backoff interval, in milliseconds
     """

-    def __init__(self, event_ids: List[str], message: Optional[str]):
-        self.event_ids = event_ids
+    def __init__(
+        self, event_ids: "StrCollection", message: Optional[str], retry_after_ms: int
+    ):
+        event_ids = list(event_ids)

         if message:
             error_message = message
         else:
-            error_message = f"Not attempting to pull event_ids={self.event_ids} because we already tried to pull them recently (backing off)."
+            error_message = (
+                f"Not attempting to pull event_ids={event_ids} because we already "
+                "tried to pull them recently (backing off)."
+            )

         super().__init__(error_message)

+        self.event_ids = event_ids
+        self.retry_after_ms = retry_after_ms


 class HttpResponseException(CodeMessageException):
     """


@@ -1949,27 +1949,25 @@
             )

             for event in events:
                 for attempt in itertools.count():
+                    # We try a new destination on every iteration.
                     try:
-                        await self._federation_event_handler.update_state_for_partial_state_event(
-                            destination, event
-                        )
-                        break
-                    except FederationPullAttemptBackoffError as exc:
-                        # Log a warning about why we failed to process the event (the error message
-                        # for `FederationPullAttemptBackoffError` is pretty good)
-                        logger.warning("_sync_partial_state_room: %s", exc)
-                        # We do not record a failed pull attempt when we backoff fetching a missing
-                        # `prev_event` because not being able to fetch the `prev_events` just means
-                        # we won't be able to de-outlier the pulled event. But we can still use an
-                        # `outlier` in the state/auth chain for another event. So we shouldn't stop
-                        # a downstream event from trying to pull it.
-                        #
-                        # This avoids a cascade of backoff for all events in the DAG downstream from
-                        # one event backoff upstream.
+                        while True:
+                            try:
+                                await self._federation_event_handler.update_state_for_partial_state_event(
+                                    destination, event
+                                )
+                                break
+                            except FederationPullAttemptBackoffError as e:
+                                # We are in the backoff period for one of the event's
+                                # prev_events. Wait it out and try again after.
+                                logger.warning(
+                                    "%s; waiting for %d ms...", e, e.retry_after_ms
+                                )
+                                await self.clock.sleep(e.retry_after_ms / 1000)
+
+                        # Success, no need to try the rest of the destinations.
+                        break
                     except FederationError as e:
-                        # TODO: We should `record_event_failed_pull_attempt` here,
-                        #       see https://github.com/matrix-org/synapse/issues/13700
-
                         if attempt == len(destinations) - 1:
                             # We have tried every remote server for this event. Give up.
                             # TODO(faster_joins) giving up isn't the right thing to do
@@ -1986,6 +1984,8 @@ class FederationHandler:
                                 destination,
                                 e,
                             )
+                            # TODO: We should `record_event_failed_pull_attempt` here,
+                            #       see https://github.com/matrix-org/synapse/issues/13700
                             raise

                         # Try the next remote server.


@@ -140,6 +140,7 @@ class FederationEventHandler:
     """

     def __init__(self, hs: "HomeServer"):
+        self._clock = hs.get_clock()
         self._store = hs.get_datastores().main
         self._storage_controllers = hs.get_storage_controllers()
         self._state_storage_controller = self._storage_controllers.state
@@ -1038,8 +1039,8 @@
         Raises:
             FederationPullAttemptBackoffError if we are are deliberately not attempting
-            to pull the given event over federation because we've already done so
-            recently and are backing off.
+            to pull one of the given event's `prev_event`s over federation because
+            we've already done so recently and are backing off.
             FederationError if we fail to get the state from the remote server after any
             missing `prev_event`s.
         """
@@ -1053,13 +1054,22 @@
         # If we've already recently attempted to pull this missing event, don't
         # try it again so soon. Since we have to fetch all of the prev_events, we can
         # bail early here if we find any to ignore.
-        prevs_to_ignore = await self._store.get_event_ids_to_not_pull_from_backoff(
-            room_id, missing_prevs
-        )
-        if len(prevs_to_ignore) > 0:
+        prevs_with_pull_backoff = (
+            await self._store.get_event_ids_to_not_pull_from_backoff(
+                room_id, missing_prevs
+            )
+        )
+        if len(prevs_with_pull_backoff) > 0:
             raise FederationPullAttemptBackoffError(
-                event_ids=prevs_to_ignore,
-                message=f"While computing context for event={event_id}, not attempting to pull missing prev_event={prevs_to_ignore[0]} because we already tried to pull recently (backing off).",
+                event_ids=prevs_with_pull_backoff.keys(),
+                message=(
+                    f"While computing context for event={event_id}, not attempting to "
+                    f"pull missing prev_events={list(prevs_with_pull_backoff.keys())} "
+                    "because we already tried to pull recently (backing off)."
+                ),
+                retry_after_ms=(
+                    max(prevs_with_pull_backoff.values()) - self._clock.time_msec()
+                ),
             )

         if not missing_prevs:
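The retry interval handed to the exception is simply the distance from now to the latest backoff end time among the blocked prev_events, since every prev_event must be pullable before the event can be processed. A worked example with made-up millisecond timestamps:

    now = 1_000_000
    prevs_with_pull_backoff = {
        "$prev1": 1_060_000,  # backoff ends 60 s from now
        "$prev2": 1_030_000,  # backoff ends 30 s from now
    }

    # Wait until the last backoff window has closed before retrying.
    retry_after_ms = max(prevs_with_pull_backoff.values()) - now
    assert retry_after_ms == 60_000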


@@ -1544,7 +1544,7 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
         self,
         room_id: str,
         event_ids: Collection[str],
-    ) -> List[str]:
+    ) -> Dict[str, int]:
         """
         Filter down the events to ones that we've failed to pull before recently. Uses
         exponential backoff.
@@ -1554,7 +1554,8 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
             event_ids: A list of events to filter down

         Returns:
-            List of event_ids that should not be attempted to be pulled
+            A dictionary of event_ids that should not be attempted to be pulled and the
+            next timestamp at which we may try pulling them again.
         """
         event_failed_pull_attempts = await self.db_pool.simple_select_many_batch(
             table="event_failed_pull_attempts",
@@ -1570,13 +1571,14 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
         )

         current_time = self._clock.time_msec()
-        return [
-            event_failed_pull_attempt["event_id"]
-            for event_failed_pull_attempt in event_failed_pull_attempts
+
+        event_ids_with_backoff = {}
+        for event_failed_pull_attempt in event_failed_pull_attempts:
+            event_id = event_failed_pull_attempt["event_id"]
             # Exponential back-off (up to the upper bound) so we don't try to
             # pull the same event over and over. ex. 2hr, 4hr, 8hr, 16hr, etc.
-            if current_time
-            < event_failed_pull_attempt["last_attempt_ts"]
-            + (
-                2
-                ** min(
+            backoff_end_time = (
+                event_failed_pull_attempt["last_attempt_ts"]
+                + (
+                    2
+                    ** min(
@@ -1585,7 +1587,12 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
                 )
             )
-            * BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_MILLISECONDS
-        ]
+                * BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_MILLISECONDS
+            )
+
+            if current_time < backoff_end_time:  # `backoff_end_time` is exclusive
+                event_ids_with_backoff[event_id] = backoff_end_time
+
+        return event_ids_with_backoff

     async def get_missing_events(
         self,
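In other words, the storage helper now returns, for each event still in backoff, the timestamp at which its window ends: `last_attempt_ts + 2 ** min(num_attempts, upper_bound) * step`. The step is one hour (per the "2hr, 4hr, 8hr" comment, and the test below expects a 2-hour window after a single failure); the upper-bound constant is not shown in this hunk, so the `cap` below is only a placeholder:

    STEP_MS = 60 * 60 * 1000  # 1 hour

    def backoff_end_time(last_attempt_ts: int, num_attempts: int, cap: int = 8) -> int:
        # 2 hours after the first failed attempt, then 4, 8, 16, ...,
        # doubling until the (assumed) cap is reached.
        return last_attempt_ts + (2 ** min(num_attempts, cap)) * STEP_MS

    assert backoff_end_time(0, 1) == 2 * 60 * 60 * 1000  # 2 hours
    assert backoff_end_time(0, 3) == 8 * 60 * 60 * 1000  # 8 hours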


@@ -1143,19 +1143,24 @@ class EventFederationWorkerStoreTestCase(tests.unittest.HomeserverTestCase):
         tok = self.login("alice", "test")
         room_id = self.helper.create_room_as(room_creator=user_id, tok=tok)

+        failure_time = self.clock.time_msec()
         self.get_success(
             self.store.record_event_failed_pull_attempt(
                 room_id, "$failed_event_id", "fake cause"
             )
         )

-        event_ids_to_backoff = self.get_success(
+        event_ids_with_backoff = self.get_success(
             self.store.get_event_ids_to_not_pull_from_backoff(
                 room_id=room_id, event_ids=["$failed_event_id", "$normal_event_id"]
             )
         )
-        self.assertEqual(event_ids_to_backoff, ["$failed_event_id"])
+        self.assertEqual(
+            event_ids_with_backoff,
+            # We expect a 2^1 hour backoff after a single failed attempt.
+            {"$failed_event_id": failure_time + 2 * 60 * 60 * 1000},
+        )

     def test_get_event_ids_to_not_pull_from_backoff_retry_after_backoff_duration(
         self,
@@ -1179,14 +1184,14 @@ class EventFederationWorkerStoreTestCase(tests.unittest.HomeserverTestCase):
         # attempt (2^1 hours).
         self.reactor.advance(datetime.timedelta(hours=2).total_seconds())

-        event_ids_to_backoff = self.get_success(
+        event_ids_with_backoff = self.get_success(
             self.store.get_event_ids_to_not_pull_from_backoff(
                 room_id=room_id, event_ids=["$failed_event_id", "$normal_event_id"]
             )
         )
         # Since this function only returns events we should backoff from, time has
         # elapsed past the backoff range so there is no events to backoff from.
-        self.assertEqual(event_ids_to_backoff, [])
+        self.assertEqual(event_ids_with_backoff, {})


 @attr.s(auto_attribs=True)