Merge pull request #2115 from matrix-org/erikj/dedupe_federation_repl

Reduce federation replication traffic
2024-07-19 17:42:17 +00:00 · 2017-04-12 11:07:13 +01:00 · 2017-04-12 11:07:13 +01:00 · 247c736b9b
commit 247c736b9b
parent 8fbc0d29ee 1745069543
7 changed files with 203 additions and 130 deletions
--- a/synapse/app/federation_sender.py
+++ b/synapse/app/federation_sender.py
@ -28,6 +28,7 @@ from synapse.replication.slave.storage.deviceinbox import SlavedDeviceInboxStore
 from synapse.replication.slave.storage.events import SlavedEventStore
 from synapse.replication.slave.storage.receipts import SlavedReceiptsStore
 from synapse.replication.slave.storage.registration import SlavedRegistrationStore
 from synapse.replication.slave.storage.presence import SlavedPresenceStore
 from synapse.replication.slave.storage.transactions import TransactionStore
 from synapse.replication.slave.storage.devices import SlavedDeviceStore
 from synapse.replication.tcp.client import ReplicationClientHandler
@ -55,7 +56,7 @@ logger = logging.getLogger("synapse.app.appservice")
 class FederationSenderSlaveStore(
    SlavedDeviceInboxStore, TransactionStore, SlavedReceiptsStore, SlavedEventStore,
-    SlavedRegistrationStore, SlavedDeviceStore,
+    SlavedRegistrationStore, SlavedDeviceStore, SlavedPresenceStore,
 ):
    def __init__(self, db_conn, hs):
        super(FederationSenderSlaveStore, self).__init__(db_conn, hs)
--- a/synapse/app/synchrotron.py
+++ b/synapse/app/synchrotron.py
@ -20,7 +20,7 @@ from synapse.api.constants import EventTypes
 from synapse.config._base import ConfigError
 from synapse.config.homeserver import HomeServerConfig
 from synapse.config.logger import setup_logging
-from synapse.handlers.presence import PresenceHandler
+from synapse.handlers.presence import PresenceHandler, get_interested_parties
 from synapse.http.site import SynapseSite
 from synapse.http.server import JsonResource
 from synapse.metrics.resource import MetricsResource, METRICS_PREFIX
@ -44,7 +44,7 @@ from synapse.replication.tcp.client import ReplicationClientHandler
 from synapse.server import HomeServer
 from synapse.storage.client_ips import ClientIpStore
 from synapse.storage.engines import create_engine
-from synapse.storage.presence import PresenceStore, UserPresenceState
+from synapse.storage.presence import UserPresenceState
 from synapse.storage.roommember import RoomMemberStore
 from synapse.util.httpresourcetree import create_resource_tree
 from synapse.util.logcontext import LoggingContext, PreserveLoggingContext, preserve_fn
@ -89,16 +89,6 @@ class SynchrotronSlavedStore(
        RoomMemberStore.__dict__["did_forget"]
    )
    # XXX: This is a bit broken because we don't persist the accepted list in a
    # way that can be replicated. This means that we don't have a way to
    # invalidate the cache correctly.
    get_presence_list_accepted = PresenceStore.__dict__[
        "get_presence_list_accepted"
    ]
    get_presence_list_observers_accepted = PresenceStore.__dict__[
        "get_presence_list_observers_accepted"
    ]
 UPDATE_SYNCING_USERS_MS = 10 * 1000
@ -172,7 +162,6 @@ class SynchrotronPresence(object):
    get_states = PresenceHandler.get_states.__func__
    get_state = PresenceHandler.get_state.__func__
    _get_interested_parties = PresenceHandler._get_interested_parties.__func__
    current_state_for_users = PresenceHandler.current_state_for_users.__func__
    def user_syncing(self, user_id, affect_presence):
@ -206,10 +195,8 @@ class SynchrotronPresence(object):
    @defer.inlineCallbacks
    def notify_from_replication(self, states, stream_id):
-        parties = yield self._get_interested_parties(
+        parties = yield get_interested_parties(self.store, states)
-            states, calculate_remote_hosts=False
+        room_ids_to_states, users_to_states = parties
        )
        room_ids_to_states, users_to_states, _ = parties
        self.notifier.on_new_event(
            "presence_key", stream_id, rooms=room_ids_to_states.keys(),
--- a/synapse/federation/send_queue.py
+++ b/synapse/federation/send_queue.py
@ -53,18 +53,19 @@ class FederationRemoteSendQueue(object):
        self.server_name = hs.hostname
        self.clock = hs.get_clock()
        self.notifier = hs.get_notifier()
        self.is_mine_id = hs.is_mine_id
-        self.presence_map = {}
+        self.presence_map = {}  # Pending presence map user_id -> UserPresenceState
-        self.presence_changed = sorteddict()
+        self.presence_changed = sorteddict()  # Stream position -> user_id
-        self.keyed_edu = {}
+        self.keyed_edu = {}  # (destination, key) -> EDU
-        self.keyed_edu_changed = sorteddict()
+        self.keyed_edu_changed = sorteddict()  # stream position -> (destination, key)
-        self.edus = sorteddict()
+        self.edus = sorteddict()  # stream position -> Edu
-        self.failures = sorteddict()
+        self.failures = sorteddict()  # stream position -> (destination, Failure)
-        self.device_messages = sorteddict()
+        self.device_messages = sorteddict()  # stream position -> destination
        self.pos = 1
        self.pos_time = sorteddict()
@ -120,7 +121,9 @@ class FederationRemoteSendQueue(object):
                del self.presence_changed[key]
            user_ids = set(
-                user_id for uids in self.presence_changed.values() for _, user_id in uids
+                user_id
                for uids in self.presence_changed.itervalues()
                for user_id in uids
            )
            to_del = [
@ -187,18 +190,20 @@ class FederationRemoteSendQueue(object):
        self.notifier.on_new_replication_data()
-    def send_presence(self, destination, states):
+    def send_presence(self, states):
-        """As per TransactionQueue"""
+        """As per TransactionQueue
        Args:
            states (list(UserPresenceState))
        """
        pos = self._next_pos()
-        self.presence_map.update({
+        # We only want to send presence for our own users, so lets always just
-            state.user_id: state
+        # filter here just in case.
-            for state in states
+        local_states = filter(lambda s: self.is_mine_id(s.user_id), states)
        })
-        self.presence_changed[pos] = [
+        self.presence_map.update({state.user_id: state for state in local_states})
-            (destination, state.user_id) for state in states
+        self.presence_changed[pos] = [state.user_id for state in local_states]
        ]
        self.notifier.on_new_replication_data()
@ -251,15 +256,14 @@ class FederationRemoteSendQueue(object):
        keys = self.presence_changed.keys()
        i = keys.bisect_right(from_token)
        j = keys.bisect_right(to_token) + 1
-        dest_user_ids = set(
+        dest_user_ids = [
-            (pos, dest_user_id)
+            (pos, user_id)
            for pos in keys[i:j]
-            for dest_user_id in self.presence_changed[pos]
+            for user_id in self.presence_changed[pos]
-        )
+        ]
-        for (key, (dest, user_id)) in dest_user_ids:
+        for (key, user_id) in dest_user_ids:
            rows.append((key, PresenceRow(
                destination=dest,
                state=self.presence_map[user_id],
            )))
@ -357,7 +361,6 @@ class BaseFederationRow(object):
 class PresenceRow(BaseFederationRow, namedtuple("PresenceRow", (
    "destination",  # str
    "state",  # UserPresenceState
 ))):
    TypeId = "p"
@ -365,18 +368,14 @@ class PresenceRow(BaseFederationRow, namedtuple("PresenceRow", (
    @staticmethod
    def from_data(data):
        return PresenceRow(
-            destination=data["destination"],
+            state=UserPresenceState.from_dict(data)
            state=UserPresenceState.from_dict(data["state"])
        )
    def to_data(self):
-        return {
+        return self.state.as_dict()
            "destination": self.destination,
            "state": self.state.as_dict()
        }
    def add_to_buffer(self, buff):
-        buff.presence.setdefault(self.destination, []).append(self.state)
+        buff.presence.append(self.state)
 class KeyedEduRow(BaseFederationRow, namedtuple("KeyedEduRow", (
@ -487,7 +486,7 @@ TypeToRow = {
 ParsedFederationStreamData = namedtuple("ParsedFederationStreamData", (
-    "presence",  # dict of destination -> [UserPresenceState]
+    "presence",  # list(UserPresenceState)
    "keyed_edus",  # dict of destination -> { key -> Edu }
    "edus",  # dict of destination -> [Edu]
    "failures",  # dict of destination -> [failures]
@ -509,7 +508,7 @@ def process_rows_for_federation(transaction_queue, rows):
    # them into the appropriate collection and then send them off.
    buff = ParsedFederationStreamData(
-        presence={},
+        presence=[],
        keyed_edus={},
        edus={},
        failures={},
@ -526,8 +525,8 @@ def process_rows_for_federation(transaction_queue, rows):
        parsed_row = RowType.from_data(row.data)
        parsed_row.add_to_buffer(buff)
-    for destination, states in buff.presence.iteritems():
+    if buff.presence:
-        transaction_queue.send_presence(destination, states)
+        transaction_queue.send_presence(buff.presence)
    for destination, edu_map in buff.keyed_edus.iteritems():
        for key, edu in edu_map.items():
--- a/synapse/federation/transaction_queue.py
+++ b/synapse/federation/transaction_queue.py
@ -21,11 +21,11 @@ from .units import Transaction, Edu
 from synapse.api.errors import HttpResponseException
 from synapse.util.async import run_on_reactor
-from synapse.util.logcontext import preserve_context_over_fn
+from synapse.util.logcontext import preserve_context_over_fn, preserve_fn
 from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter
 from synapse.util.metrics import measure_func
 from synapse.types import get_domain_from_id
-from synapse.handlers.presence import format_user_presence_state
+from synapse.handlers.presence import format_user_presence_state, get_interested_remotes
 import synapse.metrics
 import logging
@ -79,8 +79,18 @@ class TransactionQueue(object):
        # destination -> list of tuple(edu, deferred)
        self.pending_edus_by_dest = edus = {}
-        # Presence needs to be separate as we send single aggragate EDUs
+        # Map of user_id -> UserPresenceState for all the pending presence
        # to be sent out by user_id. Entries here get processed and put in
        # pending_presence_by_dest
        self.pending_presence = {}
        # Map of destination -> user_id -> UserPresenceState of pending presence
        # to be sent to each destinations
        self.pending_presence_by_dest = presence = {}
        # Pending EDUs by their "key". Keyed EDUs are EDUs that get clobbered
        # based on their key (e.g. typing events by room_id)
        # Map of destination -> (edu_type, key) -> Edu
        self.pending_edus_keyed_by_dest = edus_keyed = {}
        metrics.register_callback(
@ -115,6 +125,8 @@ class TransactionQueue(object):
        self._is_processing = False
        self._last_poked_id = -1
        self._processing_pending_presence = False
    def can_send_to(self, destination):
        """Can we send messages to the given server?
@ -226,17 +238,71 @@ class TransactionQueue(object):
                self._attempt_new_transaction, destination
            )
-    def send_presence(self, destination, states):
+    @preserve_fn  # the caller should not yield on this
-        if not self.can_send_to(destination):
+    @defer.inlineCallbacks
    def send_presence(self, states):
        """Send the new presence states to the appropriate destinations.
        This actually queues up the presence states ready for sending and
        triggers a background task to process them and send out the transactions.
        Args:
            states (list(UserPresenceState))
        """
        # First we queue up the new presence by user ID, so multiple presence
        # updates in quick successtion are correctly handled
        # We only want to send presence for our own users, so lets always just
        # filter here just in case.
        self.pending_presence.update({
            state.user_id: state for state in states
            if self.is_mine_id(state.user_id)
        })
        # We then handle the new pending presence in batches, first figuring
        # out the destinations we need to send each state to and then poking it
        # to attempt a new transaction. We linearize this so that we don't
        # accidentally mess up the ordering and send multiple presence updates
        # in the wrong order
        if self._processing_pending_presence:
            return
-        self.pending_presence_by_dest.setdefault(destination, {}).update({
+        self._processing_pending_presence = True
        try:
            while True:
                states_map = self.pending_presence
                self.pending_presence = {}
                if not states_map:
                    break
                yield self._process_presence_inner(states_map.values())
        finally:
            self._processing_pending_presence = False
    @measure_func("txnqueue._process_presence")
    @defer.inlineCallbacks
    def _process_presence_inner(self, states):
        """Given a list of states populate self.pending_presence_by_dest and
        poke to send a new transaction to each destination
        Args:
            states (list(UserPresenceState))
        """
        hosts_and_states = yield get_interested_remotes(self.store, states)
        for destinations, states in hosts_and_states:
            for destination in destinations:
                if not self.can_send_to(destination):
                    continue
                self.pending_presence_by_dest.setdefault(
                    destination, {}
                ).update({
                    state.user_id: state for state in states
                })
-        preserve_context_over_fn(
+                preserve_fn(self._attempt_new_transaction)(destination)
            self._attempt_new_transaction, destination
        )
    def send_edu(self, destination, edu_type, content, key=None):
        edu = Edu(
--- a/synapse/handlers/presence.py
+++ b/synapse/handlers/presence.py
@ -318,11 +318,7 @@ class PresenceHandler(object):
            if to_federation_ping:
                federation_presence_out_counter.inc_by(len(to_federation_ping))
-                _, _, hosts_to_states = yield self._get_interested_parties(
+                self._push_to_remotes(to_federation_ping.values())
                    to_federation_ping.values()
                )
                self._push_to_remotes(hosts_to_states)
    def _handle_timeouts(self):
        """Checks the presence of users that have timed out and updates as
@ -614,54 +610,6 @@ class PresenceHandler(object):
        defer.returnValue(states)
    @defer.inlineCallbacks
    def _get_interested_parties(self, states, calculate_remote_hosts=True):
        """Given a list of states return which entities (rooms, users, servers)
        are interested in the given states.
        Returns:
            3-tuple: `(room_ids_to_states, users_to_states, hosts_to_states)`,
            with each item being a dict of `entity_name` -> `[UserPresenceState]`
        """
        room_ids_to_states = {}
        users_to_states = {}
        for state in states:
            room_ids = yield self.store.get_rooms_for_user(state.user_id)
            for room_id in room_ids:
                room_ids_to_states.setdefault(room_id, []).append(state)
            plist = yield self.store.get_presence_list_observers_accepted(state.user_id)
            for u in plist:
                users_to_states.setdefault(u, []).append(state)
            # Always notify self
            users_to_states.setdefault(state.user_id, []).append(state)
        hosts_to_states = {}
        if calculate_remote_hosts:
            for room_id, states in room_ids_to_states.items():
                local_states = filter(lambda s: self.is_mine_id(s.user_id), states)
                if not local_states:
                    continue
                hosts = yield self.store.get_hosts_in_room(room_id)
                for host in hosts:
                    hosts_to_states.setdefault(host, []).extend(local_states)
        for user_id, states in users_to_states.items():
            local_states = filter(lambda s: self.is_mine_id(s.user_id), states)
            if not local_states:
                continue
            host = get_domain_from_id(user_id)
            hosts_to_states.setdefault(host, []).extend(local_states)
        # TODO: de-dup hosts_to_states, as a single host might have multiple
        # of same presence
        defer.returnValue((room_ids_to_states, users_to_states, hosts_to_states))
    @defer.inlineCallbacks
    def _persist_and_notify(self, states):
        """Persist states in the database, poke the notifier and send to
@ -669,34 +617,33 @@ class PresenceHandler(object):
        """
        stream_id, max_token = yield self.store.update_presence(states)
-        parties = yield self._get_interested_parties(states)
+        parties = yield get_interested_parties(self.store, states)
-        room_ids_to_states, users_to_states, hosts_to_states = parties
+        room_ids_to_states, users_to_states = parties
        self.notifier.on_new_event(
            "presence_key", stream_id, rooms=room_ids_to_states.keys(),
-            users=[UserID.from_string(u) for u in users_to_states.keys()]
+            users=[UserID.from_string(u) for u in users_to_states]
        )
-        self._push_to_remotes(hosts_to_states)
+        self._push_to_remotes(states)
    @defer.inlineCallbacks
    def notify_for_states(self, state, stream_id):
-        parties = yield self._get_interested_parties([state])
+        parties = yield get_interested_parties(self.store, [state])
-        room_ids_to_states, users_to_states, hosts_to_states = parties
+        room_ids_to_states, users_to_states = parties
        self.notifier.on_new_event(
            "presence_key", stream_id, rooms=room_ids_to_states.keys(),
-            users=[UserID.from_string(u) for u in users_to_states.keys()]
+            users=[UserID.from_string(u) for u in users_to_states]
        )
-    def _push_to_remotes(self, hosts_to_states):
+    def _push_to_remotes(self, states):
        """Sends state updates to remote servers.
        Args:
-            hosts_to_states (dict): Mapping `server_name` -> `[UserPresenceState]`
+            states (list(UserPresenceState))
        """
-        for host, states in hosts_to_states.items():
+        self.federation.send_presence(states)
            self.federation.send_presence(host, states)
    @defer.inlineCallbacks
    def incoming_presence(self, origin, content):
@ -837,14 +784,13 @@ class PresenceHandler(object):
        if self.is_mine(user):
            state = yield self.current_state_for_user(user.to_string())
-            hosts = set(get_domain_from_id(u) for u in user_ids)
+            self._push_to_remotes([state])
            self._push_to_remotes({host: (state,) for host in hosts})
        else:
            user_ids = filter(self.is_mine_id, user_ids)
            states = yield self.current_state_for_users(user_ids)
-            self._push_to_remotes({user.domain: states.values()})
+            self._push_to_remotes(states.values())
    @defer.inlineCallbacks
    def get_presence_list(self, observer_user, accepted=None):
@ -1344,3 +1290,66 @@ def handle_update(prev_state, new_state, is_mine, wheel_timer, now):
        persist_and_notify = True
    return new_state, persist_and_notify, federation_ping
@defer.inlineCallbacks
 def get_interested_parties(store, states):
    """Given a list of states return which entities (rooms, users)
    are interested in the given states.
    Args:
        states (list(UserPresenceState))
    Returns:
        2-tuple: `(room_ids_to_states, users_to_states)`,
        with each item being a dict of `entity_name` -> `[UserPresenceState]`
    """
    room_ids_to_states = {}
    users_to_states = {}
    for state in states:
        room_ids = yield store.get_rooms_for_user(state.user_id)
        for room_id in room_ids:
            room_ids_to_states.setdefault(room_id, []).append(state)
        plist = yield store.get_presence_list_observers_accepted(state.user_id)
        for u in plist:
            users_to_states.setdefault(u, []).append(state)
        # Always notify self
        users_to_states.setdefault(state.user_id, []).append(state)
    defer.returnValue((room_ids_to_states, users_to_states))
@defer.inlineCallbacks
 def get_interested_remotes(store, states):
    """Given a list of presence states figure out which remote servers
    should be sent which.
    All the presence states should be for local users only.
    Args:
        store (DataStore)
        states (list(UserPresenceState))
    Returns:
        Deferred list of ([destinations], [UserPresenceState]), where for
        each row the list of UserPresenceState should be sent to each
        destination
    """
    hosts_and_states = []
    # First we look up the rooms each user is in (as well as any explicit
    # subscriptions), then for each distinct room we look up the remote
    # hosts in those rooms.
    room_ids_to_states, users_to_states = yield get_interested_parties(store, states)
    for room_id, states in room_ids_to_states.iteritems():
        hosts = yield store.get_hosts_in_room(room_id)
        hosts_and_states.append((hosts, states))
    for user_id, states in users_to_states.iteritems():
        host = get_domain_from_id(user_id)
        hosts_and_states.append(([host], states))
    defer.returnValue(hosts_and_states)
--- a/synapse/replication/slave/storage/events.py
+++ b/synapse/replication/slave/storage/events.py
@ -71,6 +71,7 @@ class SlavedEventStore(BaseSlavedStore):
    # to reach inside the __dict__ to extract them.
    get_rooms_for_user = RoomMemberStore.__dict__["get_rooms_for_user"]
    get_users_in_room = RoomMemberStore.__dict__["get_users_in_room"]
    get_hosts_in_room = RoomMemberStore.__dict__["get_hosts_in_room"]
    get_users_who_share_room_with_user = (
        RoomMemberStore.__dict__["get_users_who_share_room_with_user"]
    )
--- a/synapse/replication/slave/storage/presence.py
+++ b/synapse/replication/slave/storage/presence.py
@ -39,6 +39,16 @@ class SlavedPresenceStore(BaseSlavedStore):
    _get_presence_for_user = PresenceStore.__dict__["_get_presence_for_user"]
    get_presence_for_users = PresenceStore.__dict__["get_presence_for_users"]
    # XXX: This is a bit broken because we don't persist the accepted list in a
    # way that can be replicated. This means that we don't have a way to
    # invalidate the cache correctly.
    get_presence_list_accepted = PresenceStore.__dict__[
        "get_presence_list_accepted"
    ]
    get_presence_list_observers_accepted = PresenceStore.__dict__[
        "get_presence_list_observers_accepted"
    ]
    def get_current_presence_token(self):
        return self._presence_id_gen.get_current_token()