Fix spinloop during partial state sync when a prev event is in backoff (#15351)

Previously, we would spin in a tight loop until
`update_state_for_partial_state_event` stopped raising
`FederationPullAttemptBackoffError`s. Replace the spinloop with a wait
until the backoff period has expired.

Signed-off-by: Sean Quah <seanq@matrix.org>
This commit is contained in:
Sean Quah 2023-03-30 13:36:41 +01:00 committed by GitHub
parent a3bad89d57
commit d9f694932c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 80 additions and 48 deletions

View file

@ -1949,27 +1949,25 @@ class FederationHandler:
)
for event in events:
for attempt in itertools.count():
# We try a new destination on every iteration.
try:
await self._federation_event_handler.update_state_for_partial_state_event(
destination, event
)
break
except FederationPullAttemptBackoffError as exc:
# Log a warning about why we failed to process the event (the error message
# for `FederationPullAttemptBackoffError` is pretty good)
logger.warning("_sync_partial_state_room: %s", exc)
# We do not record a failed pull attempt when we backoff fetching a missing
# `prev_event` because not being able to fetch the `prev_events` just means
# we won't be able to de-outlier the pulled event. But we can still use an
# `outlier` in the state/auth chain for another event. So we shouldn't stop
# a downstream event from trying to pull it.
#
# This avoids a cascade of backoff for all events in the DAG downstream from
# one event backoff upstream.
except FederationError as e:
# TODO: We should `record_event_failed_pull_attempt` here,
# see https://github.com/matrix-org/synapse/issues/13700
while True:
try:
await self._federation_event_handler.update_state_for_partial_state_event(
destination, event
)
break
except FederationPullAttemptBackoffError as e:
# We are in the backoff period for one of the event's
# prev_events. Wait it out and try again after.
logger.warning(
"%s; waiting for %d ms...", e, e.retry_after_ms
)
await self.clock.sleep(e.retry_after_ms / 1000)
# Success, no need to try the rest of the destinations.
break
except FederationError as e:
if attempt == len(destinations) - 1:
# We have tried every remote server for this event. Give up.
# TODO(faster_joins) giving up isn't the right thing to do
@ -1986,6 +1984,8 @@ class FederationHandler:
destination,
e,
)
# TODO: We should `record_event_failed_pull_attempt` here,
# see https://github.com/matrix-org/synapse/issues/13700
raise
# Try the next remote server.