Speed up rebuilding of the user directory for local users (#15529)

The idea here is to batch up the work.
This commit is contained in:
Erik Johnston 2023-05-03 14:41:37 +01:00 committed by GitHub
parent 9890f23469
commit fc3a878220
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 171 additions and 76 deletions

1
changelog.d/15529.misc Normal file
View File

@ -0,0 +1 @@
Speed up rebuilding of the user directory for local users.

View File

@ -386,13 +386,20 @@ class LoggingTransaction:
self.executemany(sql, args) self.executemany(sql, args)
def execute_values( def execute_values(
self, sql: str, values: Iterable[Iterable[Any]], fetch: bool = True self,
sql: str,
values: Iterable[Iterable[Any]],
template: Optional[str] = None,
fetch: bool = True,
) -> List[Tuple]: ) -> List[Tuple]:
"""Corresponds to psycopg2.extras.execute_values. Only available when """Corresponds to psycopg2.extras.execute_values. Only available when
using postgres. using postgres.
The `fetch` parameter must be set to False if the query does not return The `fetch` parameter must be set to False if the query does not return
rows (e.g. INSERTs). rows (e.g. INSERTs).
The `template` is the snippet merged with each item in `values` (called
`argslist` by psycopg2) to compose the query.
""" """
assert isinstance(self.database_engine, PostgresEngine) assert isinstance(self.database_engine, PostgresEngine)
from psycopg2.extras import execute_values from psycopg2.extras import execute_values
@ -400,7 +407,9 @@ class LoggingTransaction:
return self._do_execute( return self._do_execute(
# TODO: is it safe for values to be Iterable[Iterable[Any]] here? # TODO: is it safe for values to be Iterable[Iterable[Any]] here?
# https://www.psycopg.org/docs/extras.html?highlight=execute_batch#psycopg2.extras.execute_values says values should be Sequence[Sequence] # https://www.psycopg.org/docs/extras.html?highlight=execute_batch#psycopg2.extras.execute_values says values should be Sequence[Sequence]
lambda the_sql: execute_values(self.txn, the_sql, values, fetch=fetch), lambda the_sql: execute_values(
self.txn, the_sql, values, template=template, fetch=fetch
),
sql, sql,
) )

View File

@ -27,6 +27,8 @@ from typing import (
cast, cast,
) )
import attr
try: try:
# Figure out if ICU support is available for searching users. # Figure out if ICU support is available for searching users.
import icu import icu
@ -66,6 +68,19 @@ logger = logging.getLogger(__name__)
TEMP_TABLE = "_temp_populate_user_directory" TEMP_TABLE = "_temp_populate_user_directory"
@attr.s(auto_attribs=True, frozen=True)
class _UserDirProfile:
    """Helper type for the user directory code for an entry to be inserted into
    the directory.

    Instances are frozen, so attributes cannot be reassigned after
    construction. Only `user_id` is required; `display_name` and `avatar_url`
    both default to None.
    """

    # ID of the user this directory entry describes. Callers in this file build
    # it in the `@localpart:server_name` form.
    user_id: str

    # If the display name or avatar URL are unexpected types, replace with None
    # (both values are passed through the `non_null_str_or_none` converter).
    display_name: Optional[str] = attr.ib(default=None, converter=non_null_str_or_none)
    avatar_url: Optional[str] = attr.ib(default=None, converter=non_null_str_or_none)
class UserDirectoryBackgroundUpdateStore(StateDeltasStore): class UserDirectoryBackgroundUpdateStore(StateDeltasStore):
# How many records do we calculate before sending it to # How many records do we calculate before sending it to
# add_users_who_share_private_rooms? # add_users_who_share_private_rooms?
@ -381,25 +396,65 @@ class UserDirectoryBackgroundUpdateStore(StateDeltasStore):
% (len(users_to_work_on), progress["remaining"]) % (len(users_to_work_on), progress["remaining"])
) )
for user_id in users_to_work_on: # First filter down to users we want to insert into the user directory.
if await self.should_include_local_user_in_dir(user_id): users_to_insert = [
profile = await self.get_profileinfo(get_localpart_from_id(user_id)) # type: ignore[attr-defined] user_id
await self.update_profile_in_user_dir( for user_id in users_to_work_on
user_id, profile.display_name, profile.avatar_url if await self.should_include_local_user_in_dir(user_id)
) ]
# We've finished processing a user. Delete it from the table. # Next fetch their profiles. Note that the `user_id` here is the
await self.db_pool.simple_delete_one( # *localpart*, and that not all users have profiles.
TEMP_TABLE + "_users", {"user_id": user_id} profile_rows = await self.db_pool.simple_select_many_batch(
) table="profiles",
# Update the remaining counter. column="user_id",
progress["remaining"] -= 1 iterable=[get_localpart_from_id(u) for u in users_to_insert],
await self.db_pool.runInteraction( retcols=(
"populate_user_directory", "user_id",
self.db_pool.updates._background_update_progress_txn, "displayname",
"populate_user_directory_process_users", "avatar_url",
progress, ),
keyvalues={},
desc="populate_user_directory_process_users_get_profiles",
)
profiles = {
f"@{row['user_id']}:{self.server_name}": _UserDirProfile(
f"@{row['user_id']}:{self.server_name}",
row["displayname"],
row["avatar_url"],
) )
for row in profile_rows
}
profiles_to_insert = [
profiles.get(user_id) or _UserDirProfile(user_id)
for user_id in users_to_insert
]
# Actually insert the users with their profiles into the directory.
await self.db_pool.runInteraction(
"populate_user_directory_process_users_insertion",
self._update_profiles_in_user_dir_txn,
profiles_to_insert,
)
# We've finished processing the users. Delete them from the table.
await self.db_pool.simple_delete_many(
table=TEMP_TABLE + "_users",
column="user_id",
iterable=users_to_work_on,
keyvalues={},
desc="populate_user_directory_process_users_delete",
)
# Update the remaining counter.
progress["remaining"] -= len(users_to_work_on)
await self.db_pool.runInteraction(
"populate_user_directory",
self.db_pool.updates._background_update_progress_txn,
"populate_user_directory_process_users",
progress,
)
return len(users_to_work_on) return len(users_to_work_on)
@ -584,72 +639,102 @@ class UserDirectoryBackgroundUpdateStore(StateDeltasStore):
Update or add a user's profile in the user directory. Update or add a user's profile in the user directory.
If the user is remote, the profile will be marked as not stale. If the user is remote, the profile will be marked as not stale.
""" """
# If the display name or avatar URL are unexpected types, replace with None. await self.db_pool.runInteraction(
display_name = non_null_str_or_none(display_name) "update_profiles_in_user_dir",
avatar_url = non_null_str_or_none(avatar_url) self._update_profiles_in_user_dir_txn,
[_UserDirProfile(user_id, display_name, avatar_url)],
)
def _update_profile_in_user_dir_txn(txn: LoggingTransaction) -> None: def _update_profiles_in_user_dir_txn(
self.db_pool.simple_upsert_txn( self,
txn: LoggingTransaction,
profiles: Sequence[_UserDirProfile],
) -> None:
self.db_pool.simple_upsert_many_txn(
txn,
table="user_directory",
key_names=("user_id",),
key_values=[(p.user_id,) for p in profiles],
value_names=("display_name", "avatar_url"),
value_values=[
(
p.display_name,
p.avatar_url,
)
for p in profiles
],
)
# Remote users: Make sure the profile is not marked as stale anymore.
remote_users = [
p.user_id for p in profiles if not self.hs.is_mine_id(p.user_id)
]
if remote_users:
self.db_pool.simple_delete_many_txn(
txn, txn,
table="user_directory", table="user_directory_stale_remote_users",
keyvalues={"user_id": user_id}, column="user_id",
values={"display_name": display_name, "avatar_url": avatar_url}, values=remote_users,
keyvalues={},
) )
if not self.hs.is_mine_id(user_id): if isinstance(self.database_engine, PostgresEngine):
# Remote users: Make sure the profile is not marked as stale anymore. # We weight the localpart most highly, then display name and finally
self.db_pool.simple_delete_txn( # server name
txn, template = """
table="user_directory_stale_remote_users", (
keyvalues={"user_id": user_id}, %s,
setweight(to_tsvector('simple', %s), 'A')
|| setweight(to_tsvector('simple', %s), 'D')
|| setweight(to_tsvector('simple', COALESCE(%s, '')), 'B')
) )
"""
# The display name that goes into the database index. sql = """
index_display_name = display_name INSERT INTO user_directory_search(user_id, vector)
if index_display_name is not None: VALUES ? ON CONFLICT (user_id) DO UPDATE SET vector=EXCLUDED.vector
index_display_name = _filter_text_for_index(index_display_name) """
txn.execute_values(
if isinstance(self.database_engine, PostgresEngine): sql,
# We weight the localpart most highly, then display name and finally [
# server name
sql = """
INSERT INTO user_directory_search(user_id, vector)
VALUES (?,
setweight(to_tsvector('simple', ?), 'A')
|| setweight(to_tsvector('simple', ?), 'D')
|| setweight(to_tsvector('simple', COALESCE(?, '')), 'B')
) ON CONFLICT (user_id) DO UPDATE SET vector=EXCLUDED.vector
"""
txn.execute(
sql,
( (
user_id, p.user_id,
get_localpart_from_id(user_id), get_localpart_from_id(p.user_id),
get_domain_from_id(user_id), get_domain_from_id(p.user_id),
index_display_name, _filter_text_for_index(p.display_name)
), if p.display_name
) else None,
elif isinstance(self.database_engine, Sqlite3Engine): )
value = ( for p in profiles
"%s %s" % (user_id, index_display_name) ],
if index_display_name template=template,
else user_id fetch=False,
) )
self.db_pool.simple_upsert_txn( elif isinstance(self.database_engine, Sqlite3Engine):
txn, values = []
table="user_directory_search", for p in profiles:
keyvalues={"user_id": user_id}, if p.display_name is not None:
values={"value": value}, index_display_name = _filter_text_for_index(p.display_name)
) value = f"{p.user_id} {index_display_name}"
else: else:
# This should be unreachable. value = p.user_id
raise Exception("Unrecognized database engine")
txn.call_after(self.get_user_in_directory.invalidate, (user_id,)) values.append((value,))
await self.db_pool.runInteraction( self.db_pool.simple_upsert_many_txn(
"update_profile_in_user_dir", _update_profile_in_user_dir_txn txn,
) table="user_directory_search",
key_names=("user_id",),
key_values=[(p.user_id,) for p in profiles],
value_names=("value",),
value_values=values,
)
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
for p in profiles:
txn.call_after(self.get_user_in_directory.invalidate, (p.user_id,))
async def add_users_who_share_private_room( async def add_users_who_share_private_room(
self, room_id: str, user_id_tuples: Iterable[Tuple[str, str]] self, room_id: str, user_id_tuples: Iterable[Tuple[str, str]]