From a4bf72c30c5953b721a64eae89db186fa8735bb3 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 29 Aug 2019 17:38:51 +0100 Subject: [PATCH 01/10] Censor redactions in DB after a month --- synapse/storage/events.py | 88 ++++++++++++++++++- .../schema/delta/56/redaction_censor.sql | 17 ++++ tests/storage/test_redaction.py | 71 +++++++++++++++ 3 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 synapse/storage/schema/delta/56/redaction_censor.sql diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 5a95c36a8..2970da682 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -23,7 +23,7 @@ from functools import wraps from six import iteritems, text_type from six.moves import range -from canonicaljson import json +from canonicaljson import encode_canonical_json, json from prometheus_client import Counter, Histogram from twisted.internet import defer @@ -33,6 +33,7 @@ from synapse.api.constants import EventTypes from synapse.api.errors import SynapseError from synapse.events import EventBase # noqa: F401 from synapse.events.snapshot import EventContext # noqa: F401 +from synapse.events.utils import prune_event_dict from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable from synapse.logging.utils import log_function from synapse.metrics import BucketCollector @@ -262,6 +263,13 @@ class EventsStore( hs.get_clock().looping_call(read_forward_extremities, 60 * 60 * 1000) + def _censor_redactions(): + return run_as_background_process( + "_censor_redactions", self._censor_redactions + ) + + hs.get_clock().looping_call(_censor_redactions, 10 * 60 * 1000) + @defer.inlineCallbacks def _read_forward_extremities(self): def fetch(txn): @@ -1548,6 +1556,84 @@ class EventsStore( (event.event_id, event.redacts), ) + @defer.inlineCallbacks + def _censor_redactions(self): + """Censors all redactions older than a month that haven't been censored. + + By censor we mean update the event_json table with the redacted event. + + Returns: + Deferred + """ + + if self.stream_ordering_month_ago is None: + return + + max_pos = self.stream_ordering_month_ago + + # We fetch all redactions that point to an event that we have that has + # a stream ordering from over a month ago, that we haven't yet censored + # in the DB. + sql = """ + SELECT er.event_id, redacts FROM redactions + INNER JOIN events AS er USING (event_id) + INNER JOIN events AS eb ON (er.room_id = eb.room_id AND redacts = eb.event_id) + WHERE NOT have_censored + AND ? <= er.stream_ordering AND er.stream_ordering <= ? + ORDER BY er.stream_ordering ASC + LIMIT ? + """ + + rows = yield self._execute( + "_censor_redactions_fetch", None, sql, -max_pos, max_pos, 100 + ) + + updates = [] + + for redaction_id, event_id in rows: + redaction_event = yield self.get_event(redaction_id, allow_none=True) + original_event = yield self.get_event( + event_id, allow_rejected=True, allow_none=True + ) + + # The SQL above ensures that we have both the redaction and + # original event, so if the `get_event` calls return None it + # means that the redaction wasn't allowed. Either way we know that + # the result won't change so we mark the fact that we've checked. + if ( + redaction_event + and original_event + and original_event.internal_metadata.is_redacted() + ): + # Redaction was allowed + pruned_json = encode_canonical_json( + prune_event_dict(original_event.get_dict()) + ) + else: + # Redaction wasn't allowed + pruned_json = None + + updates.append((redaction_id, event_id, pruned_json)) + + def _update_censor_txn(txn): + for redaction_id, event_id, pruned_json in updates: + if pruned_json: + self._simple_update_one_txn( + txn, + table="event_json", + keyvalues={"event_id": event_id}, + updatevalues={"json": pruned_json}, + ) + + self._simple_update_one_txn( + txn, + table="redactions", + keyvalues={"event_id": redaction_id}, + updatevalues={"have_censored": True}, + ) + + yield self.runInteraction("_update_censor_txn", _update_censor_txn) + @defer.inlineCallbacks def count_daily_messages(self): """ diff --git a/synapse/storage/schema/delta/56/redaction_censor.sql b/synapse/storage/schema/delta/56/redaction_censor.sql new file mode 100644 index 000000000..fe51b0230 --- /dev/null +++ b/synapse/storage/schema/delta/56/redaction_censor.sql @@ -0,0 +1,17 @@ +/* Copyright 2019 The Matrix.org Foundation C.I.C. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +ALTER TABLE redactions ADD COLUMN have_censored BOOL NOT NULL DEFAULT false; +CREATE INDEX redactions_have_censored ON redactions(event_id) WHERE not have_censored; diff --git a/tests/storage/test_redaction.py b/tests/storage/test_redaction.py index d961b81d4..0c9f3c707 100644 --- a/tests/storage/test_redaction.py +++ b/tests/storage/test_redaction.py @@ -17,6 +17,8 @@ from mock import Mock +from canonicaljson import json + from twisted.internet import defer from synapse.api.constants import EventTypes, Membership @@ -286,3 +288,72 @@ class RedactionTestCase(unittest.HomeserverTestCase): self.assertEqual( fetched.unsigned["redacted_because"].event_id, redaction_event_id2 ) + + def test_redact_censor(self): + """Test that a redacted event gets censored in the DB after a month + """ + + self.get_success( + self.inject_room_member(self.room1, self.u_alice, Membership.JOIN) + ) + + msg_event = self.get_success(self.inject_message(self.room1, self.u_alice, "t")) + + # Check event has not been redacted: + event = self.get_success(self.store.get_event(msg_event.event_id)) + + self.assertObjectHasAttributes( + { + "type": EventTypes.Message, + "user_id": self.u_alice.to_string(), + "content": {"body": "t", "msgtype": "message"}, + }, + event, + ) + + self.assertFalse("redacted_because" in event.unsigned) + + # Redact event + reason = "Because I said so" + self.get_success( + self.inject_redaction(self.room1, msg_event.event_id, self.u_alice, reason) + ) + + event = self.get_success(self.store.get_event(msg_event.event_id)) + + self.assertTrue("redacted_because" in event.unsigned) + + self.assertObjectHasAttributes( + { + "type": EventTypes.Message, + "user_id": self.u_alice.to_string(), + "content": {}, + }, + event, + ) + + event_json = self.get_success( + self.store._simple_select_one_onecol( + table="event_json", + keyvalues={"event_id": msg_event.event_id}, + retcol="json", + ) + ) + + self.assert_dict( + {"content": {"body": "t", "msgtype": "message"}}, json.loads(event_json) + ) + + # Advance by 30 days + self.reactor.advance(60 * 60 * 24 * 31) + self.reactor.advance(60 * 60 * 2) + + event_json = self.get_success( + self.store._simple_select_one_onecol( + table="event_json", + keyvalues={"event_id": msg_event.event_id}, + retcol="json", + ) + ) + + self.assert_dict({"content": {}}, json.loads(event_json)) From 549f974897ddf2fb0e5dc571c3da8034a1eb6510 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 30 Aug 2019 09:51:33 +0100 Subject: [PATCH 02/10] Newsfile --- changelog.d/5934.feature | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/5934.feature diff --git a/changelog.d/5934.feature b/changelog.d/5934.feature new file mode 100644 index 000000000..eae969a52 --- /dev/null +++ b/changelog.d/5934.feature @@ -0,0 +1 @@ +Redact events in the database that have been redacted for a month. From 3ff0422d2dbfa668df365da99a4b7caeea85528d Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 5 Sep 2019 17:16:03 +0100 Subject: [PATCH 03/10] Make redaction retention period configurable --- docs/sample_config.yaml | 5 +++++ synapse/config/server.py | 15 +++++++++++++++ synapse/storage/events.py | 6 ++++-- tests/storage/test_redaction.py | 4 +++- 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index 43969bbb7..e23b80d2b 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -306,6 +306,11 @@ listeners: # #allow_per_room_profiles: false +# How long to keep redacted events in unredacted form in the database. +# By default redactions are kept indefinitely. +# +#redaction_retention_period: 30d + ## TLS ## diff --git a/synapse/config/server.py b/synapse/config/server.py index 2abdef097..8efab924d 100644 --- a/synapse/config/server.py +++ b/synapse/config/server.py @@ -162,6 +162,16 @@ class ServerConfig(Config): self.mau_trial_days = config.get("mau_trial_days", 0) + # How long to keep redacted events in the database in unredacted form + # before redacting them. + redaction_retention_period = config.get("redaction_retention_period") + if redaction_retention_period: + self.redaction_retention_period = self.parse_duration( + redaction_retention_period + ) + else: + self.redaction_retention_period = None + # Options to disable HS self.hs_disabled = config.get("hs_disabled", False) self.hs_disabled_message = config.get("hs_disabled_message", "") @@ -718,6 +728,11 @@ class ServerConfig(Config): # Defaults to 'true'. # #allow_per_room_profiles: false + + # How long to keep redacted events in unredacted form in the database. + # By default redactions are kept indefinitely. + # + #redaction_retention_period: 30d """ % locals() ) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 2970da682..d0d1781c9 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -1566,10 +1566,12 @@ class EventsStore( Deferred """ - if self.stream_ordering_month_ago is None: + if not self.hs.config.redaction_retention_period: return - max_pos = self.stream_ordering_month_ago + max_pos = yield self.find_first_stream_ordering_after_ts( + self._clock.time_msec() - self.hs.config.redaction_retention_period + ) # We fetch all redactions that point to an event that we have that has # a stream ordering from over a month ago, that we haven't yet censored diff --git a/tests/storage/test_redaction.py b/tests/storage/test_redaction.py index 0c9f3c707..f0e86d41a 100644 --- a/tests/storage/test_redaction.py +++ b/tests/storage/test_redaction.py @@ -344,7 +344,9 @@ class RedactionTestCase(unittest.HomeserverTestCase): {"content": {"body": "t", "msgtype": "message"}}, json.loads(event_json) ) - # Advance by 30 days + # Advance by 30 days, then advance again to ensure that the looping call + # for updating the stream position gets called and then the looping call + # for the censoring gets called. self.reactor.advance(60 * 60 * 24 * 31) self.reactor.advance(60 * 60 * 2) From ad9b64b4969537ac339469152eaa437bcf4b6609 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 5 Sep 2019 17:17:47 +0100 Subject: [PATCH 04/10] Fix test --- tests/storage/test_redaction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/storage/test_redaction.py b/tests/storage/test_redaction.py index f0e86d41a..deecfad9f 100644 --- a/tests/storage/test_redaction.py +++ b/tests/storage/test_redaction.py @@ -31,8 +31,10 @@ from tests.utils import create_room class RedactionTestCase(unittest.HomeserverTestCase): def make_homeserver(self, reactor, clock): + config = self.default_config() + config["redaction_retention_period"] = "30d" return self.setup_test_homeserver( - resource_for_federation=Mock(), http_client=None + resource_for_federation=Mock(), http_client=None, config=config ) def prepare(self, reactor, clock, hs): From 80e14a8546efb9e2f9edec3b1de0a8b943351252 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 9 Sep 2019 13:23:41 +0100 Subject: [PATCH 05/10] Handle setting retention period to 0 --- synapse/config/server.py | 2 +- synapse/storage/events.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/config/server.py b/synapse/config/server.py index 8efab924d..aa71835dc 100644 --- a/synapse/config/server.py +++ b/synapse/config/server.py @@ -165,7 +165,7 @@ class ServerConfig(Config): # How long to keep redacted events in the database in unredacted form # before redacting them. redaction_retention_period = config.get("redaction_retention_period") - if redaction_retention_period: + if redaction_retention_period is not None: self.redaction_retention_period = self.parse_duration( redaction_retention_period ) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index a5d13ddc4..77ba7eb2a 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -1566,7 +1566,7 @@ class EventsStore( Deferred """ - if not self.hs.config.redaction_retention_period: + if self.hs.config.redaction_retention_period is None: return max_pos = yield self.find_first_stream_ordering_after_ts( From fffe17b77d06927aaf64fa80be5b765c870a4ef5 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 9 Sep 2019 13:24:24 +0100 Subject: [PATCH 06/10] Don't start looping call unless enabled --- synapse/storage/events.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 77ba7eb2a..9ef7aefd9 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -268,7 +268,8 @@ class EventsStore( "_censor_redactions", self._censor_redactions ) - hs.get_clock().looping_call(_censor_redactions, 10 * 60 * 1000) + if self.hs.config.redaction_retention_period is not None: + hs.get_clock().looping_call(_censor_redactions, 10 * 60 * 1000) @defer.inlineCallbacks def _read_forward_extremities(self): From 916c69722833dd94c53d0fedeec8cc42d2085e73 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 9 Sep 2019 13:31:00 +0100 Subject: [PATCH 07/10] Fixup comment --- synapse/storage/events.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 9ef7aefd9..4484ae7ce 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -269,7 +269,7 @@ class EventsStore( ) if self.hs.config.redaction_retention_period is not None: - hs.get_clock().looping_call(_censor_redactions, 10 * 60 * 1000) + hs.get_clock().looping_call(_censor_redactions, 5 * 60 * 1000) @defer.inlineCallbacks def _read_forward_extremities(self): @@ -1574,9 +1574,17 @@ class EventsStore( self._clock.time_msec() - self.hs.config.redaction_retention_period ) - # We fetch all redactions that point to an event that we have that has - # a stream ordering from over a month ago, that we haven't yet censored - # in the DB. + # We fetch all redactions that: + # 1. point to an event we have that has, + # 2. has a stream ordering from before the cut off, and + # 3. we haven't yet censored. + # + # This is limited to 100 events to ensure that we don't try and do too + # much at once. We'll get called again so this should eventually catch + # up. + # + # We use the range [-max_pos, max_pos] to handle backfilled events, + # which are given negative stream ordering. sql = """ SELECT er.event_id, redacts FROM redactions INNER JOIN events AS er USING (event_id) From e7184a437062ae21846b8e071ded73526209e90c Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 9 Sep 2019 13:33:38 +0100 Subject: [PATCH 08/10] Use better names in SQL --- synapse/storage/events.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 4484ae7ce..0da6e0b1a 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -1586,12 +1586,15 @@ class EventsStore( # We use the range [-max_pos, max_pos] to handle backfilled events, # which are given negative stream ordering. sql = """ - SELECT er.event_id, redacts FROM redactions - INNER JOIN events AS er USING (event_id) - INNER JOIN events AS eb ON (er.room_id = eb.room_id AND redacts = eb.event_id) + SELECT redact_event.event_id, redacts FROM redactions + INNER JOIN events AS redact_event USING (event_id) + INNER JOIN events AS original_event ON ( + redact_event.room_id = original_event.room_id + AND redacts = original_event.event_id + ) WHERE NOT have_censored - AND ? <= er.stream_ordering AND er.stream_ordering <= ? - ORDER BY er.stream_ordering ASC + AND ? <= redact_event.stream_ordering AND redact_event.stream_ordering <= ? + ORDER BY redact_event.stream_ordering ASC LIMIT ? """ From 8b9ade8c7871c862cf2122a156f00e411cd7a276 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 9 Sep 2019 13:40:05 +0100 Subject: [PATCH 09/10] Default to censoring redactions after seven days --- docs/sample_config.yaml | 8 +++++--- synapse/config/server.py | 10 ++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index e23b80d2b..24adc3da2 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -306,10 +306,12 @@ listeners: # #allow_per_room_profiles: false -# How long to keep redacted events in unredacted form in the database. -# By default redactions are kept indefinitely. +# How long to keep redacted events in unredacted form in the database. After +# this period redacted events get replaced with their redacted form in the DB. # -#redaction_retention_period: 30d +# Defaults to `7d`. Set to `null` to disable. +# +redaction_retention_period: 7d ## TLS ## diff --git a/synapse/config/server.py b/synapse/config/server.py index aa71835dc..c8b9fe2d0 100644 --- a/synapse/config/server.py +++ b/synapse/config/server.py @@ -164,7 +164,7 @@ class ServerConfig(Config): # How long to keep redacted events in the database in unredacted form # before redacting them. - redaction_retention_period = config.get("redaction_retention_period") + redaction_retention_period = config.get("redaction_retention_period", "7d") if redaction_retention_period is not None: self.redaction_retention_period = self.parse_duration( redaction_retention_period @@ -729,10 +729,12 @@ class ServerConfig(Config): # #allow_per_room_profiles: false - # How long to keep redacted events in unredacted form in the database. - # By default redactions are kept indefinitely. + # How long to keep redacted events in unredacted form in the database. After + # this period redacted events get replaced with their redacted form in the DB. # - #redaction_retention_period: 30d + # Defaults to `7d`. Set to `null` to disable. + # + redaction_retention_period: 7d """ % locals() ) From 580f3df9b2573c0278dd952d1478689e5cd23a7b Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 9 Sep 2019 15:08:24 +0100 Subject: [PATCH 10/10] Fix comments --- synapse/storage/events.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 0da6e0b1a..ddf7ab647 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -1559,7 +1559,8 @@ class EventsStore( @defer.inlineCallbacks def _censor_redactions(self): - """Censors all redactions older than a month that haven't been censored. + """Censors all redactions older than the configured period that haven't + been censored yet. By censor we mean update the event_json table with the redacted event. @@ -1575,7 +1576,7 @@ class EventsStore( ) # We fetch all redactions that: - # 1. point to an event we have that has, + # 1. point to an event we have, # 2. has a stream ordering from before the cut off, and # 3. we haven't yet censored. #