From 641908f72f94357049eca1cab632918d252da3e0 Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Tue, 31 May 2022 16:15:08 +0100 Subject: [PATCH] Faster room joins: Resume state re-syncing after a Synapse restart (#12813) Signed-off-by: Sean Quah --- changelog.d/12813.misc | 1 + synapse/handlers/federation.py | 27 ++++++++++++++++++++++++-- synapse/storage/databases/main/room.py | 27 ++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 changelog.d/12813.misc diff --git a/changelog.d/12813.misc b/changelog.d/12813.misc new file mode 100644 index 000000000..8be9f3eb4 --- /dev/null +++ b/changelog.d/12813.misc @@ -0,0 +1 @@ +Resume state re-syncing for rooms with partial state after a Synapse restart. diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index b4b63a342..659f27944 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -169,6 +169,14 @@ class FederationHandler: self.third_party_event_rules = hs.get_third_party_event_rules() + # if this is the main process, fire off a background process to resume + # any partial-state-resync operations which were in flight when we + # were shut down. + if not hs.config.worker.worker_app: + run_as_background_process( + "resume_sync_partial_state_room", self._resume_sync_partial_state_room + ) + async def maybe_backfill( self, room_id: str, current_depth: int, limit: int ) -> bool: @@ -470,6 +478,8 @@ class FederationHandler: """ # TODO: We should be able to call this on workers, but the upgrading of # room stuff after join currently doesn't work on workers. + # TODO: Before we relax this condition, we need to allow re-syncing of + # partial room state to happen on workers. assert self.config.worker.worker_app is None logger.debug("Joining %s to %s", joinee, room_id) @@ -550,8 +560,6 @@ class FederationHandler: if ret.partial_state: # Kick off the process of asynchronously fetching the state for this # room. - # - # TODO(faster_joins): pick this up again on restart run_as_background_process( desc="sync_partial_state_room", func=self._sync_partial_state_room, @@ -1463,6 +1471,20 @@ class FederationHandler: # well. return None + async def _resume_sync_partial_state_room(self) -> None: + """Resumes resyncing of all partial-state rooms after a restart.""" + assert not self.config.worker.worker_app + + partial_state_rooms = await self.store.get_partial_state_rooms_and_servers() + for room_id, servers_in_room in partial_state_rooms.items(): + run_as_background_process( + desc="sync_partial_state_room", + func=self._sync_partial_state_room, + initial_destination=None, + other_destinations=servers_in_room, + room_id=room_id, + ) + async def _sync_partial_state_room( self, initial_destination: Optional[str], @@ -1477,6 +1499,7 @@ class FederationHandler: `initial_destination` is unavailable room_id: room to be resynced """ + assert not self.config.worker.worker_app # TODO(faster_joins): do we need to lock to avoid races? What happens if other # worker processes kick off a resync in parallel? Perhaps we should just elect diff --git a/synapse/storage/databases/main/room.py b/synapse/storage/databases/main/room.py index 10f2ceb50..cfd8ce162 100644 --- a/synapse/storage/databases/main/room.py +++ b/synapse/storage/databases/main/room.py @@ -23,6 +23,7 @@ from typing import ( Collection, Dict, List, + Mapping, Optional, Tuple, Union, @@ -1081,6 +1082,32 @@ class RoomWorkerStore(CacheInvalidationWorkerStore): get_rooms_for_retention_period_in_range_txn, ) + async def get_partial_state_rooms_and_servers( + self, + ) -> Mapping[str, Collection[str]]: + """Get all rooms containing events with partial state, and the servers known + to be in the room. + + Returns: + A dictionary of rooms with partial state, with room IDs as keys and + lists of servers in rooms as values. + """ + room_servers: Dict[str, List[str]] = {} + + rows = await self.db_pool.simple_select_list( + "partial_state_rooms_servers", + keyvalues=None, + retcols=("room_id", "server_name"), + desc="get_partial_state_rooms", + ) + + for row in rows: + room_id = row["room_id"] + server_name = row["server_name"] + room_servers.setdefault(room_id, []).append(server_name) + + return room_servers + async def clear_partial_state_room(self, room_id: str) -> bool: # this can race with incoming events, so we watch out for FK errors. # TODO(faster_joins): this still doesn't completely fix the race, since the persist process