From a9b5ea6fc1e26ff791118b67af01fdad8e9c68c8 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 18 Feb 2019 17:53:31 +0000 Subject: [PATCH 1/5] Batch cache invalidation over replication Currently whenever the current state changes in a room invalidate a lot of caches, which cause *a lot* of traffic over replication. Instead, lets batch up all those invalidations and send a single poke down the replication streams. Hopefully this will reduce load on the master process by substantially reducing traffic. --- synapse/replication/slave/storage/_base.py | 19 +++++--- synapse/storage/_base.py | 57 +++++++++++++++++++++- synapse/storage/events.py | 25 +--------- 3 files changed, 69 insertions(+), 32 deletions(-) diff --git a/synapse/replication/slave/storage/_base.py b/synapse/replication/slave/storage/_base.py index 2d81d49e9..1353a32d0 100644 --- a/synapse/replication/slave/storage/_base.py +++ b/synapse/replication/slave/storage/_base.py @@ -17,7 +17,7 @@ import logging import six -from synapse.storage._base import SQLBaseStore +from synapse.storage._base import _CURRENT_STATE_CACHE_NAME, SQLBaseStore from synapse.storage.engines import PostgresEngine from ._slaved_id_tracker import SlavedIdTracker @@ -54,12 +54,17 @@ class BaseSlavedStore(SQLBaseStore): if stream_name == "caches": self._cache_id_gen.advance(token) for row in rows: - try: - getattr(self, row.cache_func).invalidate(tuple(row.keys)) - except AttributeError: - # We probably haven't pulled in the cache in this worker, - # which is fine. - pass + if row.cache_func == _CURRENT_STATE_CACHE_NAME: + room_id = row.keys[0] + members_changed = set(row.keys[1:]) + self._invalidate_state_caches(room_id, members_changed) + else: + try: + getattr(self, row.cache_func).invalidate(tuple(row.keys)) + except AttributeError: + # We probably haven't pulled in the cache in this worker, + # which is fine. + pass def _invalidate_cache_and_stream(self, txn, cache_func, keys): txn.call_after(cache_func.invalidate, keys) diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py index e12416184..f7c6d714a 100644 --- a/synapse/storage/_base.py +++ b/synapse/storage/_base.py @@ -28,6 +28,7 @@ from twisted.internet import defer from synapse.api.errors import StoreError from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.engines import PostgresEngine, Sqlite3Engine +from synapse.types import get_domain_from_id from synapse.util.caches.descriptors import Cache from synapse.util.logcontext import LoggingContext, PreserveLoggingContext from synapse.util.stringutils import exception_to_unicode @@ -64,6 +65,10 @@ UNIQUE_INDEX_BACKGROUND_UPDATES = { "event_search": "event_search_event_id_idx", } +# This is a special cache name we use to batch multiple invalidations of caches +# based on the current state when notifying workers over replication. +_CURRENT_STATE_CACHE_NAME = "cs_cache_fake" + class LoggingTransaction(object): """An object that almost-transparently proxies for the 'txn' object @@ -1184,6 +1189,56 @@ class SQLBaseStore(object): be invalidated. """ txn.call_after(cache_func.invalidate, keys) + self._send_invalidation_to_replication(txn, cache_func.__name__, keys) + + def _invalidate_state_caches_and_stream(self, txn, room_id, members_changed): + """Special case invalidation of caches based on current state. + + We special case this so that we can batch the cache invalidations into a + single replication poke. + + Args: + txn + room_id (str): Room were state changed + members_changed (set[str]): The user_ids of members that have changed + """ + txn.call_after(self._invalidate_state_caches, room_id, members_changed) + + keys = [room_id] + keys.extend(members_changed) + self._send_invalidation_to_replication( + txn, _CURRENT_STATE_CACHE_NAME, keys, + ) + + def _invalidate_state_caches(self, room_id, members_changed): + """Invalidates caches that are based on the current state, but does + not stream invalidations down replication. + + Args: + room_id (str): Room were state changed + members_changed (set[str]): The user_ids of members that have changed + """ + for member in members_changed: + self.get_rooms_for_user_with_stream_ordering.invalidate((member,)) + + for host in set(get_domain_from_id(u) for u in members_changed): + self.is_host_joined.invalidate((room_id, host)) + self.was_host_joined.invalidate((room_id, host)) + + self.get_users_in_room.invalidate((room_id,)) + self.get_room_summary.invalidate((room_id,)) + self.get_current_state_ids.invalidate((room_id,)) + + def _send_invalidation_to_replication(self, txn, cache_name, keys): + """Notifies replication that given cache has been invalidated. + + Note that this does *not* invalidate the cache locally. + + Args: + txn + cache_name (str) + keys (list[str]) + """ if isinstance(self.database_engine, PostgresEngine): # get_next() returns a context manager which is designed to wrap @@ -1201,7 +1256,7 @@ class SQLBaseStore(object): table="cache_invalidation_stream", values={ "stream_id": stream_id, - "cache_func": cache_func.__name__, + "cache_func": cache_name, "keys": list(keys), "invalidation_ts": self.clock.time_msec(), } diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 81b250480..06db9e56e 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -979,30 +979,7 @@ class EventsStore(StateGroupWorkerStore, EventFederationStore, EventsWorkerStore if ev_type == EventTypes.Member ) - for member in members_changed: - self._invalidate_cache_and_stream( - txn, self.get_rooms_for_user_with_stream_ordering, (member,) - ) - - for host in set(get_domain_from_id(u) for u in members_changed): - self._invalidate_cache_and_stream( - txn, self.is_host_joined, (room_id, host) - ) - self._invalidate_cache_and_stream( - txn, self.was_host_joined, (room_id, host) - ) - - self._invalidate_cache_and_stream( - txn, self.get_users_in_room, (room_id,) - ) - - self._invalidate_cache_and_stream( - txn, self.get_room_summary, (room_id,) - ) - - self._invalidate_cache_and_stream( - txn, self.get_current_state_ids, (room_id,) - ) + self._invalidate_state_caches_and_stream(txn, room_id, members_changed) def _update_forward_extremities_txn(self, txn, new_forward_extremities, max_stream_order): From 92e6fb5c89eace7aedca0cd73900d4aa44129af6 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 18 Feb 2019 17:58:17 +0000 Subject: [PATCH 2/5] Newsfile --- changelog.d/4671.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/4671.misc diff --git a/changelog.d/4671.misc b/changelog.d/4671.misc new file mode 100644 index 000000000..4dc18378e --- /dev/null +++ b/changelog.d/4671.misc @@ -0,0 +1 @@ +Improve replication performance by reducing cache invalidation traffic. From bc8fa1509d3885d111a2ef98e12e9ce66a19a3a8 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 19 Feb 2019 11:24:59 +0000 Subject: [PATCH 3/5] Documentation --- docs/tcp_replication.rst | 21 ++++++++++++++++++++- synapse/storage/_base.py | 8 ++++---- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/docs/tcp_replication.rst b/docs/tcp_replication.rst index 62225ba6f..852f1113a 100644 --- a/docs/tcp_replication.rst +++ b/docs/tcp_replication.rst @@ -137,7 +137,6 @@ for each stream so that on reconneciton it can start streaming from the correct place. Note: not all RDATA have valid tokens due to batching. See ``RdataCommand`` for more details. - Example ~~~~~~~ @@ -221,3 +220,23 @@ SYNC (S, C) See ``synapse/replication/tcp/commands.py`` for a detailed description and the format of each command. + + +Cache Invalidation Stream +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The cache invalidation stream is used to inform workers when they need to +invalidate any of their caches in the data store. This is done by streaming all +cache invalidations done on master down to the workers, assuming that any caches +on the workers also exist on the master. + +Each individual cache invalidation results in a row being sent down replication, +which includes the cache name (the name of the function) and they key to +invalidate. For example:: + + > RDATA caches 550953771 ["get_user_by_id", ["@bob:example.com"], 1550574873251] + +However, there are times when a number of caches need to be invalidated at the +same time with the same key. To reduce traffic we batch those invalidations into +a single poke by defining a special cache name that workers understand to mean +to expand to invalidate the correct caches. diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py index f7c6d714a..1c8d3f002 100644 --- a/synapse/storage/_base.py +++ b/synapse/storage/_base.py @@ -1199,8 +1199,8 @@ class SQLBaseStore(object): Args: txn - room_id (str): Room were state changed - members_changed (set[str]): The user_ids of members that have changed + room_id (str): Room where state changed + members_changed (Iterable[str]): The user_ids of members that have changed """ txn.call_after(self._invalidate_state_caches, room_id, members_changed) @@ -1215,7 +1215,7 @@ class SQLBaseStore(object): not stream invalidations down replication. Args: - room_id (str): Room were state changed + room_id (str): Room where state changed members_changed (set[str]): The user_ids of members that have changed """ for member in members_changed: @@ -1237,7 +1237,7 @@ class SQLBaseStore(object): Args: txn cache_name (str) - keys (list[str]) + keys (iterable[str]) """ if isinstance(self.database_engine, PostgresEngine): From 1bb35e3a83146a55bf7d8a18d38aa0d59f1289d5 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 19 Feb 2019 11:34:40 +0000 Subject: [PATCH 4/5] Use itertools --- synapse/storage/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py index 1c8d3f002..9db594bc4 100644 --- a/synapse/storage/_base.py +++ b/synapse/storage/_base.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import itertools import logging import sys import threading @@ -1204,8 +1205,7 @@ class SQLBaseStore(object): """ txn.call_after(self._invalidate_state_caches, room_id, members_changed) - keys = [room_id] - keys.extend(members_changed) + keys = itertools.chain([room_id], members_changed) self._send_invalidation_to_replication( txn, _CURRENT_STATE_CACHE_NAME, keys, ) From 62175a20e51b4ce71b9e7a18755a42e259bd2ff8 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 19 Feb 2019 11:38:40 +0000 Subject: [PATCH 5/5] Docs --- docs/tcp_replication.rst | 5 +++++ synapse/storage/_base.py | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/tcp_replication.rst b/docs/tcp_replication.rst index 852f1113a..73436cea6 100644 --- a/docs/tcp_replication.rst +++ b/docs/tcp_replication.rst @@ -240,3 +240,8 @@ However, there are times when a number of caches need to be invalidated at the same time with the same key. To reduce traffic we batch those invalidations into a single poke by defining a special cache name that workers understand to mean to expand to invalidate the correct caches. + +Currently the special cache names are declared in ``synapse/storage/_base.py`` +and are: + +1. ``cs_cache_fake`` ─ invalidates caches that depend on the current state diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py index 9db594bc4..f1a5366b9 100644 --- a/synapse/storage/_base.py +++ b/synapse/storage/_base.py @@ -1201,7 +1201,7 @@ class SQLBaseStore(object): Args: txn room_id (str): Room where state changed - members_changed (Iterable[str]): The user_ids of members that have changed + members_changed (iterable[str]): The user_ids of members that have changed """ txn.call_after(self._invalidate_state_caches, room_id, members_changed) @@ -1216,7 +1216,8 @@ class SQLBaseStore(object): Args: room_id (str): Room where state changed - members_changed (set[str]): The user_ids of members that have changed + members_changed (iterable[str]): The user_ids of members that have + changed """ for member in members_changed: self.get_rooms_for_user_with_stream_ordering.invalidate((member,))