Faster room joins: Resume state re-syncing after a Synapse restart (#12813)

Signed-off-by: Sean Quah <seanq@matrix.org>
2024-10-01 01:36:05 -04:00 · 2022-05-31 16:15:08 +01:00 · 2022-05-31 16:15:08 +01:00 · 641908f72f
commit 641908f72f
parent 2fba1076c5
3 changed files with 53 additions and 2 deletions
--- a/changelog.d/12813.misc
+++ b/changelog.d/12813.misc
@ -0,0 +1 @@
 Resume state re-syncing for rooms with partial state after a Synapse restart.
--- a/synapse/handlers/federation.py
+++ b/synapse/handlers/federation.py
@ -169,6 +169,14 @@ class FederationHandler:
        self.third_party_event_rules = hs.get_third_party_event_rules()
        # if this is the main process, fire off a background process to resume
        # any partial-state-resync operations which were in flight when we
        # were shut down.
        if not hs.config.worker.worker_app:
            run_as_background_process(
                "resume_sync_partial_state_room", self._resume_sync_partial_state_room
            )
    async def maybe_backfill(
        self, room_id: str, current_depth: int, limit: int
    ) -> bool:
@ -470,6 +478,8 @@ class FederationHandler:
        """
        # TODO: We should be able to call this on workers, but the upgrading of
        # room stuff after join currently doesn't work on workers.
        # TODO: Before we relax this condition, we need to allow re-syncing of
        # partial room state to happen on workers.
        assert self.config.worker.worker_app is None
        logger.debug("Joining %s to %s", joinee, room_id)
@ -550,8 +560,6 @@ class FederationHandler:
            if ret.partial_state:
                # Kick off the process of asynchronously fetching the state for this
                # room.
                #
                # TODO(faster_joins): pick this up again on restart
                run_as_background_process(
                    desc="sync_partial_state_room",
                    func=self._sync_partial_state_room,
@ -1463,6 +1471,20 @@ class FederationHandler:
        # well.
        return None
    async def _resume_sync_partial_state_room(self) -> None:
        """Resumes resyncing of all partial-state rooms after a restart."""
        assert not self.config.worker.worker_app
        partial_state_rooms = await self.store.get_partial_state_rooms_and_servers()
        for room_id, servers_in_room in partial_state_rooms.items():
            run_as_background_process(
                desc="sync_partial_state_room",
                func=self._sync_partial_state_room,
                initial_destination=None,
                other_destinations=servers_in_room,
                room_id=room_id,
            )
    async def _sync_partial_state_room(
        self,
        initial_destination: Optional[str],
@ -1477,6 +1499,7 @@ class FederationHandler:
                `initial_destination` is unavailable
            room_id: room to be resynced
        """
        assert not self.config.worker.worker_app
        # TODO(faster_joins): do we need to lock to avoid races? What happens if other
        #   worker processes kick off a resync in parallel? Perhaps we should just elect
--- a/synapse/storage/databases/main/room.py
+++ b/synapse/storage/databases/main/room.py
@ -23,6 +23,7 @@ from typing import (
    Collection,
    Dict,
    List,
    Mapping,
    Optional,
    Tuple,
    Union,
@ -1081,6 +1082,32 @@ class RoomWorkerStore(CacheInvalidationWorkerStore):
            get_rooms_for_retention_period_in_range_txn,
        )
    async def get_partial_state_rooms_and_servers(
        self,
    ) -> Mapping[str, Collection[str]]:
        """Get all rooms containing events with partial state, and the servers known
        to be in the room.
        Returns:
            A dictionary of rooms with partial state, with room IDs as keys and
            lists of servers in rooms as values.
        """
        room_servers: Dict[str, List[str]] = {}
        rows = await self.db_pool.simple_select_list(
            "partial_state_rooms_servers",
            keyvalues=None,
            retcols=("room_id", "server_name"),
            desc="get_partial_state_rooms",
        )
        for row in rows:
            room_id = row["room_id"]
            server_name = row["server_name"]
            room_servers.setdefault(room_id, []).append(server_name)
        return room_servers
    async def clear_partial_state_room(self, room_id: str) -> bool:
        # this can race with incoming events, so we watch out for FK errors.
        # TODO(faster_joins): this still doesn't completely fix the race, since the persist process
		`@ -0,0 +1 @@`
							`Resume state re-syncing for rooms with partial state after a Synapse restart.`