mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2025-08-09 10:22:18 -04:00
Fix spinloop during partial state sync when a prev event is in backoff (#15351)
Previously, we would spin in a tight loop until `update_state_for_partial_state_event` stopped raising `FederationPullAttemptBackoffError`s. Replace the spinloop with a wait until the backoff period has expired.

Signed-off-by: Sean Quah <seanq@matrix.org>
This commit is contained in:
parent
a3bad89d57
commit
d9f694932c
6 changed files with 80 additions and 48 deletions
|
@@ -1949,27 +1949,25 @@ class FederationHandler:
|
|||
)
|
||||
for event in events:
|
||||
for attempt in itertools.count():
|
||||
# We try a new destination on every iteration.
|
||||
try:
|
||||
await self._federation_event_handler.update_state_for_partial_state_event(
|
||||
destination, event
|
||||
)
|
||||
break
|
||||
except FederationPullAttemptBackoffError as exc:
|
||||
# Log a warning about why we failed to process the event (the error message
|
||||
# for `FederationPullAttemptBackoffError` is pretty good)
|
||||
logger.warning("_sync_partial_state_room: %s", exc)
|
||||
# We do not record a failed pull attempt when we backoff fetching a missing
|
||||
# `prev_event` because not being able to fetch the `prev_events` just means
|
||||
# we won't be able to de-outlier the pulled event. But we can still use an
|
||||
# `outlier` in the state/auth chain for another event. So we shouldn't stop
|
||||
# a downstream event from trying to pull it.
|
||||
#
|
||||
# This avoids a cascade of backoff for all events in the DAG downstream from
|
||||
# one event backoff upstream.
|
||||
except FederationError as e:
|
||||
# TODO: We should `record_event_failed_pull_attempt` here,
|
||||
# see https://github.com/matrix-org/synapse/issues/13700
|
||||
while True:
|
||||
try:
|
||||
await self._federation_event_handler.update_state_for_partial_state_event(
|
||||
destination, event
|
||||
)
|
||||
break
|
||||
except FederationPullAttemptBackoffError as e:
|
||||
# We are in the backoff period for one of the event's
|
||||
# prev_events. Wait it out and try again after.
|
||||
logger.warning(
|
||||
"%s; waiting for %d ms...", e, e.retry_after_ms
|
||||
)
|
||||
await self.clock.sleep(e.retry_after_ms / 1000)
|
||||
|
||||
# Success, no need to try the rest of the destinations.
|
||||
break
|
||||
except FederationError as e:
|
||||
if attempt == len(destinations) - 1:
|
||||
# We have tried every remote server for this event. Give up.
|
||||
# TODO(faster_joins) giving up isn't the right thing to do
|
||||
|
@@ -1986,6 +1984,8 @@ class FederationHandler:
|
|||
destination,
|
||||
e,
|
||||
)
|
||||
# TODO: We should `record_event_failed_pull_attempt` here,
|
||||
# see https://github.com/matrix-org/synapse/issues/13700
|
||||
raise
|
||||
|
||||
# Try the next remote server.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue