add a cache to have_seen_event (#9953)

Empirically, this helped my server considerably when handling gaps in Matrix HQ. The problem was that we would repeatedly call have_seen_events for the same set of (50K or so) auth_events, and each of those calls would take many minutes to complete, even though the underlying query is only an index scan.
Richard van der Hoff 2021-06-01 12:04:47 +01:00 committed by GitHub
parent 10e6d2abce
commit b4b2fd2ece
10 changed files with 206 additions and 21 deletions
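
The change pairs a per-event `@cached` lookup (have_seen_event) with a `@cachedList` batch helper (_have_seen_events_dict), so repeated checks for the same (room_id, event_id) pairs are answered from memory and only cache misses reach the database. Below is a minimal, self-contained sketch of that idea in plain Python; it deliberately avoids Synapse's cache descriptors, and fetch_event_ids_from_db is a hypothetical stand-in for the index-only SELECT issued by the real code.

    from typing import Callable, Dict, Iterable, Set, Tuple


    class HaveSeenEventCacheSketch:
        """Sketch of the caching pattern: cache per (room_id, event_id), batch the misses."""

        def __init__(self, fetch_event_ids_from_db: Callable[[Set[str]], Set[str]]):
            # fetch_event_ids_from_db(event_ids) returns the subset of ids that exist
            # in the events table; it stands in for have_seen_events_txn's SELECT.
            self._fetch = fetch_event_ids_from_db
            self._cache: Dict[Tuple[str, str], bool] = {}

        def have_seen_events(self, room_id: str, event_ids: Iterable[str]) -> Set[str]:
            keys = [(room_id, eid) for eid in event_ids]
            # only the cache misses hit the database...
            misses = [k for k in keys if k not in self._cache]
            if misses:
                found = self._fetch({eid for _rid, eid in misses})
                # ... and both positive and negative answers are cached for next time
                for rid, eid in misses:
                    self._cache[(rid, eid)] = eid in found
            return {eid for rid, eid in keys if self._cache[(rid, eid)]}

        def invalidate_room(self, room_id: str) -> None:
            # keying the cache by (room_id, event_id) is what allows a whole room's
            # entries to be dropped at once (the reason the real cache uses tree=True)
            for key in [k for k in self._cache if k[0] == room_id]:
                del self._cache[key]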


@@ -22,6 +22,7 @@ from typing import (
     Iterable,
     List,
     Optional,
+    Set,
     Tuple,
     overload,
 )
@@ -55,7 +56,7 @@ from synapse.storage.engines import PostgresEngine
 from synapse.storage.util.id_generators import MultiWriterIdGenerator, StreamIdGenerator
 from synapse.storage.util.sequence import build_sequence_generator
 from synapse.types import JsonDict, get_domain_from_id
-from synapse.util.caches.descriptors import cached
+from synapse.util.caches.descriptors import cached, cachedList
 from synapse.util.caches.lrucache import LruCache
 from synapse.util.iterutils import batch_iter
 from synapse.util.metrics import Measure
@@ -1045,32 +1046,74 @@ class EventsWorkerStore(SQLBaseStore):
 
         return {r["event_id"] for r in rows}
 
-    async def have_seen_events(self, event_ids):
+    async def have_seen_events(
+        self, room_id: str, event_ids: Iterable[str]
+    ) -> Set[str]:
         """Given a list of event ids, check if we have already processed them.
 
+        The room_id is only used to structure the cache (so that it can later be
+        invalidated by room_id) - there is no guarantee that the events are actually
+        in the room in question.
+
         Args:
-            event_ids (iterable[str]):
+            room_id: Room we are polling
+            event_ids: events we are looking for
 
         Returns:
             set[str]: The events we have already seen.
         """
-        # if the event cache contains the event, obviously we've seen it.
-        results = {x for x in event_ids if self._get_event_cache.contains(x)}
+        res = await self._have_seen_events_dict(
+            (room_id, event_id) for event_id in event_ids
+        )
+        return {eid for ((_rid, eid), have_event) in res.items() if have_event}
 
-        def have_seen_events_txn(txn, chunk):
-            sql = "SELECT event_id FROM events as e WHERE "
+    @cachedList("have_seen_event", "keys")
+    async def _have_seen_events_dict(
+        self, keys: Iterable[Tuple[str, str]]
+    ) -> Dict[Tuple[str, str], bool]:
+        """Helper for have_seen_events
+
+        Returns:
+             a dict {(room_id, event_id)-> bool}
+        """
+        # if the event cache contains the event, obviously we've seen it.
+
+        cache_results = {
+            (rid, eid) for (rid, eid) in keys if self._get_event_cache.contains((eid,))
+        }
+        results = {x: True for x in cache_results}
+
+        def have_seen_events_txn(txn, chunk: Tuple[Tuple[str, str], ...]):
+            # we deliberately do *not* query the database for room_id, to make the
+            # query an index-only lookup on `events_event_id_key`.
+            #
+            # We therefore pull the events from the database into a set...
+
+            sql = "SELECT event_id FROM events AS e WHERE "
             clause, args = make_in_list_sql_clause(
-                txn.database_engine, "e.event_id", chunk
+                txn.database_engine, "e.event_id", [eid for (_rid, eid) in chunk]
             )
             txn.execute(sql + clause, args)
-            results.update(row[0] for row in txn)
+            found_events = {eid for eid, in txn}
 
-        for chunk in batch_iter((x for x in event_ids if x not in results), 100):
+            # ... and then we can update the results for each row in the batch
+            results.update({(rid, eid): (eid in found_events) for (rid, eid) in chunk})
+
+        # each batch requires its own index scan, so we make the batches as big as
+        # possible.
+        for chunk in batch_iter((k for k in keys if k not in cache_results), 500):
             await self.db_pool.runInteraction(
                 "have_seen_events", have_seen_events_txn, chunk
             )
+
         return results
 
+    @cached(max_entries=100000, tree=True)
+    async def have_seen_event(self, room_id: str, event_id: str):
+        # this only exists for the benefit of the @cachedList descriptor on
+        # _have_seen_events_dict
+        raise NotImplementedError()
+
     def _get_current_state_event_counts_txn(self, txn, room_id):
         """
         See get_current_state_event_counts.
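
For callers, the visible change is that have_seen_events now takes the room id as well as the event ids, so call sites have to pass both. A short illustrative sketch of the new signature (store, room_id and auth_event_ids are hypothetical names, not part of this diff):

    # The second call for the same (room_id, event_id) pairs should be answered from
    # the have_seen_event cache rather than by another index scan.
    async def fetch_missing_auth_events(store, room_id: str, auth_event_ids: list):
        seen = await store.have_seen_events(room_id, auth_event_ids)
        return set(auth_event_ids) - seen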