add a cache to have_seen_event (#9953)

Empirically, this helped my server considerably when handling gaps in Matrix HQ. The problem was that we would repeatedly call have_seen_events for the same set of (50K or so) auth_events, each of which would take many minutes to complete, even though it's only an index scan.
2025-08-07 08:52:15 -04:00 · 2021-06-01 12:04:47 +01:00 · 2021-06-01 12:04:47 +01:00 · b4b2fd2ece
commit b4b2fd2ece
parent 10e6d2abce
10 changed files with 206 additions and 21 deletions
--- a/synapse/storage/databases/main/purge_events.py
+++ b/synapse/storage/databases/main/purge_events.py
@ -16,14 +16,14 @@ import logging
 from typing import Any, List, Set, Tuple

 from synapse.api.errors import SynapseError
-from synapse.storage._base import SQLBaseStore
+from synapse.storage.databases.main import CacheInvalidationWorkerStore
 from synapse.storage.databases.main.state import StateGroupWorkerStore
 from synapse.types import RoomStreamToken

 logger = logging.getLogger(__name__)


-class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
+class PurgeEventsStore(StateGroupWorkerStore, CacheInvalidationWorkerStore):
    async def purge_history(
        self, room_id: str, token: str, delete_local_events: bool
    ) -> Set[int]:
@ -203,8 +203,6 @@ class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
            "DELETE FROM event_to_state_groups "
            "WHERE event_id IN (SELECT event_id from events_to_purge)"
        )
-        for event_id, _ in event_rows:
-            txn.call_after(self._get_state_group_for_event.invalidate, (event_id,))

        # Delete all remote non-state events
        for table in (
@ -283,6 +281,20 @@ class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
        # so make sure to keep this actually last.
        txn.execute("DROP TABLE events_to_purge")

+        for event_id, should_delete in event_rows:
+            self._invalidate_cache_and_stream(
+                txn, self._get_state_group_for_event, (event_id,)
+            )
+
+            # XXX: This is racy, since have_seen_events could be called between the
+            #    transaction completing and the invalidation running. On the other hand,
+            #    that's no different to calling `have_seen_events` just before the
+            #    event is deleted from the database.
+            if should_delete:
+                self._invalidate_cache_and_stream(
+                    txn, self.have_seen_event, (room_id, event_id)
+                )
+
        logger.info("[purge] done")

        return referenced_state_groups
@ -422,7 +434,11 @@ class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
        #       index on them. In any case we should be clearing out 'stream' tables
        #       periodically anyway (#5888)

-        # TODO: we could probably usefully do a bunch of cache invalidation here
+        # TODO: we could probably usefully do a bunch more cache invalidation here
+
+        # XXX: as with purge_history, this is racy, but no worse than other races
+        #   that already exist.
+        self._invalidate_cache_and_stream(txn, self.have_seen_event, (room_id,))

        logger.info("[purge] done")