Fix background updates failing to add unique indexes on receipts (#14453)

As part of the database migration to support threaded receipts, there is
a possible window in between
`73/08thread_receipts_non_null.sql.postgres` removing the original
unique constraints on `receipts_linearized` and `receipts_graph` and the
`reeipts_linearized_unique_index` and `receipts_graph_unique_index`
background updates from `72/08thread_receipts.sql` completing where
the unique constraints on `receipts_linearized` and `receipts_graph` are
missing. Any emulated upserts on these tables must therefore be
performed with a lock held, otherwise duplicate rows can end up in the
tables when there are concurrent emulated upserts. Fix the missing lock.

Note that emulated upserts no longer happen by default on sqlite, since
the minimum supported version of sqlite supports native upserts by
default now.

Finally, clean up any duplicate receipts that may have crept in before
trying to create the `receipts_graph_unique_index` and
`receipts_linearized_unique_index` unique indexes.

Signed-off-by: Sean Quah <seanq@matrix.org>
This commit is contained in:
Sean Quah 2022-11-16 15:01:22 +00:00 committed by GitHub
parent d63814fd73
commit 882277008c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 357 additions and 24 deletions

View file

@ -113,24 +113,6 @@ class ReceiptsWorkerStore(SQLBaseStore):
prefilled_cache=receipts_stream_prefill,
)
self.db_pool.updates.register_background_index_update(
"receipts_linearized_unique_index",
index_name="receipts_linearized_unique_index",
table="receipts_linearized",
columns=["room_id", "receipt_type", "user_id"],
where_clause="thread_id IS NULL",
unique=True,
)
self.db_pool.updates.register_background_index_update(
"receipts_graph_unique_index",
index_name="receipts_graph_unique_index",
table="receipts_graph",
columns=["room_id", "receipt_type", "user_id"],
where_clause="thread_id IS NULL",
unique=True,
)
def get_max_receipt_stream_id(self) -> int:
"""Get the current max stream ID for receipts stream"""
return self._receipts_id_gen.get_current_token()
@ -702,9 +684,6 @@ class ReceiptsWorkerStore(SQLBaseStore):
"data": json_encoder.encode(data),
},
where_clause=where_clause,
# receipts_linearized has a unique constraint on
# (user_id, room_id, receipt_type), so no need to lock
lock=False,
)
return rx_ts
@ -862,14 +841,13 @@ class ReceiptsWorkerStore(SQLBaseStore):
"data": json_encoder.encode(data),
},
where_clause=where_clause,
# receipts_graph has a unique constraint on
# (user_id, room_id, receipt_type), so no need to lock
lock=False,
)
class ReceiptsBackgroundUpdateStore(SQLBaseStore):
POPULATE_RECEIPT_EVENT_STREAM_ORDERING = "populate_event_stream_ordering"
RECEIPTS_LINEARIZED_UNIQUE_INDEX_UPDATE_NAME = "receipts_linearized_unique_index"
RECEIPTS_GRAPH_UNIQUE_INDEX_UPDATE_NAME = "receipts_graph_unique_index"
def __init__(
self,
@ -883,6 +861,14 @@ class ReceiptsBackgroundUpdateStore(SQLBaseStore):
self.POPULATE_RECEIPT_EVENT_STREAM_ORDERING,
self._populate_receipt_event_stream_ordering,
)
self.db_pool.updates.register_background_update_handler(
self.RECEIPTS_LINEARIZED_UNIQUE_INDEX_UPDATE_NAME,
self._background_receipts_linearized_unique_index,
)
self.db_pool.updates.register_background_update_handler(
self.RECEIPTS_GRAPH_UNIQUE_INDEX_UPDATE_NAME,
self._background_receipts_graph_unique_index,
)
async def _populate_receipt_event_stream_ordering(
self, progress: JsonDict, batch_size: int
@ -938,6 +924,143 @@ class ReceiptsBackgroundUpdateStore(SQLBaseStore):
return batch_size
async def _create_receipts_index(self, index_name: str, table: str) -> None:
"""Adds a unique index on `(room_id, receipt_type, user_id)` to the given
receipts table, for non-thread receipts."""
def _create_index(conn: LoggingDatabaseConnection) -> None:
conn.rollback()
# we have to set autocommit, because postgres refuses to
# CREATE INDEX CONCURRENTLY without it.
if isinstance(self.database_engine, PostgresEngine):
conn.set_session(autocommit=True)
try:
c = conn.cursor()
# Now that the duplicates are gone, we can create the index.
concurrently = (
"CONCURRENTLY"
if isinstance(self.database_engine, PostgresEngine)
else ""
)
sql = f"""
CREATE UNIQUE INDEX {concurrently} {index_name}
ON {table}(room_id, receipt_type, user_id)
WHERE thread_id IS NULL
"""
c.execute(sql)
finally:
if isinstance(self.database_engine, PostgresEngine):
conn.set_session(autocommit=False)
await self.db_pool.runWithConnection(_create_index)
async def _background_receipts_linearized_unique_index(
self, progress: dict, batch_size: int
) -> int:
"""Removes duplicate receipts and adds a unique index on
`(room_id, receipt_type, user_id)` to `receipts_linearized`, for non-thread
receipts."""
def _remote_duplicate_receipts_txn(txn: LoggingTransaction) -> None:
# Identify any duplicate receipts arising from
# https://github.com/matrix-org/synapse/issues/14406.
# We expect the following query to use the per-thread receipt index and take
# less than a minute.
sql = """
SELECT MAX(stream_id), room_id, receipt_type, user_id
FROM receipts_linearized
WHERE thread_id IS NULL
GROUP BY room_id, receipt_type, user_id
HAVING COUNT(*) > 1
"""
txn.execute(sql)
duplicate_keys = cast(List[Tuple[int, str, str, str]], list(txn))
# Then remove duplicate receipts, keeping the one with the highest
# `stream_id`. There should only be a single receipt with any given
# `stream_id`.
for max_stream_id, room_id, receipt_type, user_id in duplicate_keys:
sql = """
DELETE FROM receipts_linearized
WHERE
room_id = ? AND
receipt_type = ? AND
user_id = ? AND
thread_id IS NULL AND
stream_id < ?
"""
txn.execute(sql, (room_id, receipt_type, user_id, max_stream_id))
await self.db_pool.runInteraction(
self.RECEIPTS_LINEARIZED_UNIQUE_INDEX_UPDATE_NAME,
_remote_duplicate_receipts_txn,
)
await self._create_receipts_index(
"receipts_linearized_unique_index",
"receipts_linearized",
)
await self.db_pool.updates._end_background_update(
self.RECEIPTS_LINEARIZED_UNIQUE_INDEX_UPDATE_NAME
)
return 1
async def _background_receipts_graph_unique_index(
self, progress: dict, batch_size: int
) -> int:
"""Removes duplicate receipts and adds a unique index on
`(room_id, receipt_type, user_id)` to `receipts_graph`, for non-thread
receipts."""
def _remote_duplicate_receipts_txn(txn: LoggingTransaction) -> None:
# Identify any duplicate receipts arising from
# https://github.com/matrix-org/synapse/issues/14406.
# We expect the following query to use the per-thread receipt index and take
# less than a minute.
sql = """
SELECT room_id, receipt_type, user_id FROM receipts_graph
WHERE thread_id IS NULL
GROUP BY room_id, receipt_type, user_id
HAVING COUNT(*) > 1
"""
txn.execute(sql)
duplicate_keys = cast(List[Tuple[str, str, str]], list(txn))
# Then remove all duplicate receipts.
# We could be clever and try to keep the latest receipt out of every set of
# duplicates, but it's far simpler to remove them all.
for room_id, receipt_type, user_id in duplicate_keys:
sql = """
DELETE FROM receipts_graph
WHERE
room_id = ? AND
receipt_type = ? AND
user_id = ? AND
thread_id IS NULL
"""
txn.execute(sql, (room_id, receipt_type, user_id))
await self.db_pool.runInteraction(
self.RECEIPTS_GRAPH_UNIQUE_INDEX_UPDATE_NAME,
_remote_duplicate_receipts_txn,
)
await self._create_receipts_index(
"receipts_graph_unique_index",
"receipts_graph",
)
await self.db_pool.updates._end_background_update(
self.RECEIPTS_GRAPH_UNIQUE_INDEX_UPDATE_NAME
)
return 1
class ReceiptsStore(ReceiptsWorkerStore, ReceiptsBackgroundUpdateStore):
pass