Implement per-room message retention policies

2025-05-04 18:44:56 -04:00 · 2019-11-04 17:09:22 +00:00 · 2019-11-04 17:09:22 +00:00 · 09957ce0e4
commit 09957ce0e4
parent f496d25877
13 changed files with 1074 additions and 6 deletions
--- a/synapse/handlers/pagination.py
+++ b/synapse/handlers/pagination.py
@ -15,12 +15,15 @@
 # limitations under the License.
 import logging

+from six import iteritems
+
 from twisted.internet import defer
 from twisted.python.failure import Failure

 from synapse.api.constants import EventTypes, Membership
 from synapse.api.errors import SynapseError
 from synapse.logging.context import run_in_background
+from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.storage.state import StateFilter
 from synapse.types import RoomStreamToken
 from synapse.util.async_helpers import ReadWriteLock
@ -80,6 +83,114 @@ class PaginationHandler(object):
        self._purges_by_id = {}
        self._event_serializer = hs.get_event_client_serializer()

+        self._retention_default_max_lifetime = hs.config.retention_default_max_lifetime
+
+        if hs.config.retention_enabled:
+            # Run the purge jobs described in the configuration file.
+            for job in hs.config.retention_purge_jobs:
+                self.clock.looping_call(
+                    run_as_background_process,
+                    job["interval"],
+                    "purge_history_for_rooms_in_range",
+                    self.purge_history_for_rooms_in_range,
+                    job["shortest_max_lifetime"],
+                    job["longest_max_lifetime"],
+                )
+
+    @defer.inlineCallbacks
+    def purge_history_for_rooms_in_range(self, min_ms, max_ms):
+        """Purge outdated events from rooms within the given retention range.
+
+        If a default retention policy is defined in the server's configuration and its
+        'max_lifetime' is within this range, also targets rooms which don't have a
+        retention policy.
+
+        Args:
+            min_ms (int|None): Duration in milliseconds that define the lower limit of
+                the range to handle (exclusive). If None, it means that the range has no
+                lower limit.
+            max_ms (int|None): Duration in milliseconds that define the upper limit of
+                the range to handle (inclusive). If None, it means that the range has no
+                upper limit.
+        """
+        # We want the storage layer to to include rooms with no retention policy in its
+        # return value only if a default retention policy is defined in the server's
+        # configuration and that policy's 'max_lifetime' is either lower (or equal) than
+        # max_ms or higher than min_ms (or both).
+        if self._retention_default_max_lifetime is not None:
+            include_null = True
+
+            if min_ms is not None and min_ms >= self._retention_default_max_lifetime:
+                # The default max_lifetime is lower than (or equal to) min_ms.
+                include_null = False
+
+            if max_ms is not None and max_ms < self._retention_default_max_lifetime:
+                # The default max_lifetime is higher than max_ms.
+                include_null = False
+        else:
+            include_null = False
+
+        rooms = yield self.store.get_rooms_for_retention_period_in_range(
+            min_ms, max_ms, include_null
+        )
+
+        for room_id, retention_policy in iteritems(rooms):
+            if room_id in self._purges_in_progress_by_room:
+                logger.warning(
+                    "[purge] not purging room %s as there's an ongoing purge running"
+                    " for this room",
+                    room_id,
+                )
+                continue
+
+            max_lifetime = retention_policy["max_lifetime"]
+
+            if max_lifetime is None:
+                # If max_lifetime is None, it means that include_null equals True,
+                # therefore we can safely assume that there is a default policy defined
+                # in the server's configuration.
+                max_lifetime = self._retention_default_max_lifetime
+
+            # Figure out what token we should start purging at.
+            ts = self.clock.time_msec() - max_lifetime
+
+            stream_ordering = (
+                yield self.store.find_first_stream_ordering_after_ts(ts)
+            )
+
+            r = (
+                yield self.store.get_room_event_after_stream_ordering(
+                    room_id, stream_ordering,
+                )
+            )
+            if not r:
+                logger.warning(
+                    "[purge] purging events not possible: No event found "
+                    "(ts %i => stream_ordering %i)",
+                    ts, stream_ordering,
+                )
+                continue
+
+            (stream, topo, _event_id) = r
+            token = "t%d-%d" % (topo, stream)
+
+            purge_id = random_string(16)
+
+            self._purges_by_id[purge_id] = PurgeStatus()
+
+            logger.info(
+                "Starting purging events in room %s (purge_id %s)" % (room_id, purge_id)
+            )
+
+            # We want to purge everything, including local events, and to run the purge in
+            # the background so that it's not blocking any other operation apart from
+            # other purges in the same room.
+            run_as_background_process(
+                "_purge_history",
+                self._purge_history,
+                purge_id, room_id, token, True,
+            )
+
    def start_purge_history(self, room_id, token, delete_local_events=False):
        """Start off a history purge on a room.