From 230a831c734246aa4db7bd842947c7ea277ca126 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 1 Feb 2023 15:45:10 -0500 Subject: [PATCH] Attempt to delete more duplicate rows in receipts_linearized table. (#14915) The previous assumption was that the stream_id column was unique (for a room ID, receipt type, user ID tuple), but this turned out to be incorrect. Now find the max stream ID, then map this back to a database-specific row identifier and delete other rows which match the (room ID, receipt type, user ID) tuple, but *not* the row ID. --- changelog.d/14915.bugfix | 1 + synapse/storage/databases/main/receipts.py | 34 ++++++++++++++----- tests/storage/databases/main/test_receipts.py | 4 ++- 3 files changed, 30 insertions(+), 9 deletions(-) create mode 100644 changelog.d/14915.bugfix diff --git a/changelog.d/14915.bugfix b/changelog.d/14915.bugfix new file mode 100644 index 000000000..4969e5450 --- /dev/null +++ b/changelog.d/14915.bugfix @@ -0,0 +1 @@ +Fix a bug introduced in Synapse 1.70.0 where the background updates to add non-thread unique indexes on receipts could fail when upgrading from 1.67.0 or earlier. diff --git a/synapse/storage/databases/main/receipts.py b/synapse/storage/databases/main/receipts.py index 3468f354e..29972d520 100644 --- a/synapse/storage/databases/main/receipts.py +++ b/synapse/storage/databases/main/receipts.py @@ -941,10 +941,14 @@ class ReceiptsBackgroundUpdateStore(SQLBaseStore): receipts.""" def _remote_duplicate_receipts_txn(txn: LoggingTransaction) -> None: + if isinstance(self.database_engine, PostgresEngine): + ROW_ID_NAME = "ctid" + else: + ROW_ID_NAME = "rowid" + # Identify any duplicate receipts arising from # https://github.com/matrix-org/synapse/issues/14406. - # We expect the following query to use the per-thread receipt index and take - # less than a minute. + # The following query takes less than a minute on matrix.org. sql = """ SELECT MAX(stream_id), room_id, receipt_type, user_id FROM receipts_linearized @@ -956,19 +960,33 @@ class ReceiptsBackgroundUpdateStore(SQLBaseStore): duplicate_keys = cast(List[Tuple[int, str, str, str]], list(txn)) # Then remove duplicate receipts, keeping the one with the highest - # `stream_id`. There should only be a single receipt with any given - # `stream_id`. - for max_stream_id, room_id, receipt_type, user_id in duplicate_keys: - sql = """ + # `stream_id`. Since there might be duplicate rows with the same + # `stream_id`, we delete by the ctid instead. + for stream_id, room_id, receipt_type, user_id in duplicate_keys: + sql = f""" + SELECT {ROW_ID_NAME} + FROM receipts_linearized + WHERE + room_id = ? AND + receipt_type = ? AND + user_id = ? AND + thread_id IS NULL AND + stream_id = ? + LIMIT 1 + """ + txn.execute(sql, (room_id, receipt_type, user_id, stream_id)) + row_id = cast(Tuple[str], txn.fetchone())[0] + + sql = f""" DELETE FROM receipts_linearized WHERE room_id = ? AND receipt_type = ? AND user_id = ? AND thread_id IS NULL AND - stream_id < ? + {ROW_ID_NAME} != ? """ - txn.execute(sql, (room_id, receipt_type, user_id, max_stream_id)) + txn.execute(sql, (room_id, receipt_type, user_id, row_id)) await self.db_pool.runInteraction( self.RECEIPTS_LINEARIZED_UNIQUE_INDEX_UPDATE_NAME, diff --git a/tests/storage/databases/main/test_receipts.py b/tests/storage/databases/main/test_receipts.py index 68026e283..ac77aec00 100644 --- a/tests/storage/databases/main/test_receipts.py +++ b/tests/storage/databases/main/test_receipts.py @@ -168,7 +168,9 @@ class ReceiptsBackgroundUpdateStoreTestCase(HomeserverTestCase): {"stream_id": 6, "event_id": "$some_event"}, ], (self.other_room_id, "m.read", self.user_id): [ - {"stream_id": 7, "event_id": "$some_event"} + # It is possible for stream IDs to be duplicated. + {"stream_id": 7, "event_id": "$some_event"}, + {"stream_id": 7, "event_id": "$some_event"}, ], }, expected_unique_receipts={