Merge pull request #4626 from matrix-org/erikj/fixup_user_ips_dedupe

Reduce user_ips bloat during dedupe background update
This commit is contained in:
Erik Johnston 2019-02-12 13:02:58 +00:00 committed by GitHub
commit 3df8fcca25
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 61 additions and 3 deletions

1
changelog.d/4626.misc Normal file
View File

@ -0,0 +1 @@
Improve 'user_ips' table deduplication background update

View File

@ -167,12 +167,16 @@ class ClientIpStore(background_updates.BackgroundUpdateStore):
clause = "? <= last_seen AND last_seen < ?"
args = (begin_last_seen, end_last_seen)
# (Note: The DISTINCT in the inner query is important to ensure that
# the COUNT(*) is accurate, otherwise double counting may happen due
# to the join effectively being a cross product)
txn.execute(
"""
SELECT user_id, access_token, ip,
MAX(device_id), MAX(user_agent), MAX(last_seen)
MAX(device_id), MAX(user_agent), MAX(last_seen),
COUNT(*)
FROM (
SELECT user_id, access_token, ip
SELECT DISTINCT user_id, access_token, ip
FROM user_ips
WHERE {}
) c
@ -186,7 +190,60 @@ class ClientIpStore(background_updates.BackgroundUpdateStore):
# We've got some duplicates
for i in res:
user_id, access_token, ip, device_id, user_agent, last_seen = i
user_id, access_token, ip, device_id, user_agent, last_seen, count = i
# We want to delete the duplicates so we end up with only a
# single row.
#
# The naive way of doing this would be just to delete all rows
# and reinsert a constructed row. However, if there are a lot of
# duplicate rows this can cause the table to grow a lot, which
# can be problematic in two ways:
# 1. If user_ips is already large then this can cause the
# table to rapidly grow, potentially filling the disk.
# 2. Reinserting a lot of rows can confuse the table
# statistics for postgres, causing it to not use the
# correct indices for the query above, resulting in a full
# table scan. This is incredibly slow for large tables and
# can kill database performance. (This seems to mainly
# happen for the last query where the clause is simply `? <
# last_seen`)
#
# So instead we want to delete all but *one* of the duplicate
# rows. That is hard to do reliably, so we cheat and do a two
# step process:
# 1. Delete all rows with a last_seen strictly less than the
# max last_seen. This hopefully results in deleting all but
# one row the majority of the time, but there may be
# duplicate last_seen
# 2. If multiple rows remain, we fall back to the naive method
# and simply delete all rows and reinsert.
#
# Note that this relies on no new duplicate rows being inserted,
# but if that is happening then this entire process is futile
# anyway.
# Do step 1:
txn.execute(
"""
DELETE FROM user_ips
WHERE user_id = ? AND access_token = ? AND ip = ? AND last_seen < ?
""",
(user_id, access_token, ip, last_seen)
)
if txn.rowcount == count - 1:
# We deleted all but one of the duplicate rows, i.e. there
# is exactly one remaining and so there is nothing left to
# do.
continue
elif txn.rowcount >= count:
raise Exception(
"We deleted more duplicate rows from 'user_ips' than expected",
)
# The previous step didn't delete enough rows, so we fallback to
# step 2:
# Drop all the duplicates
txn.execute(