From 3a569fb2000e972efe2e145d57ffd9441ee41665 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 8 Apr 2021 17:30:01 +0100 Subject: [PATCH] Fix sharded federation sender sometimes using 100% CPU. We pull all destinations requiring catchup from the DB in batches. However, if all those destinations get filtered out (due to the federation sender being sharded), then the `last_processed` destination doesn't get updated, and we keep requesting the same set repeatedly. --- changelog.d/9770.bugfix | 1 + synapse/federation/sender/__init__.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 changelog.d/9770.bugfix diff --git a/changelog.d/9770.bugfix b/changelog.d/9770.bugfix new file mode 100644 index 000000000..baf93138d --- /dev/null +++ b/changelog.d/9770.bugfix @@ -0,0 +1 @@ +Fix bug where sharded federation senders could get stuck repeatedly querying the DB in a loop, using lots of CPU. diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index 98bfce22f..d821dcbf6 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -734,16 +734,18 @@ class FederationSender(AbstractFederationSender): self._catchup_after_startup_timer = None break + last_processed = destinations_to_wake[-1] + destinations_to_wake = [ d for d in destinations_to_wake if self._federation_shard_config.should_handle(self._instance_name, d) ] - for last_processed in destinations_to_wake: + for destination in destinations_to_wake: logger.info( "Destination %s has outstanding catch-up, waking up.", last_processed, ) - self.wake_destination(last_processed) + self.wake_destination(destination) await self.clock.sleep(CATCH_UP_STARTUP_INTERVAL_SEC)