Refresh remote profiles that have been marked as stale, in order to fill the user directory. [rei:userdirpriv] (#14756)

* Scaffolding for background process to refresh profiles

* Add scaffolding for background process to refresh profiles for a given server

* Implement the code to select servers to refresh from

* Ensure we don't build up multiple looping calls

* Make `get_profile` able to respect backoffs

* Add logic for refreshing users

* When backing off, schedule a refresh when the backoff is over

* Wake up the background processes when we receive an interesting state event

* Add tests

* Newsfile

Signed-off-by: Olivier Wilkinson (reivilibre) <oliverw@matrix.org>

* Add comment about 1<<62

---------

Signed-off-by: Olivier Wilkinson (reivilibre) <oliverw@matrix.org>
This commit is contained in:
reivilibre 2023-03-16 11:44:11 +00:00 committed by GitHub
parent 4953cd71df
commit 1f5473465d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 504 additions and 4 deletions

View file

@ -13,15 +13,22 @@
# limitations under the License.
import logging
from http import HTTPStatus
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
from twisted.internet.interfaces import IDelayedCall
import synapse.metrics
from synapse.api.constants import EventTypes, HistoryVisibility, JoinRules, Membership
from synapse.api.errors import Codes, SynapseError
from synapse.handlers.state_deltas import MatchChange, StateDeltasHandler
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.storage.databases.main.user_directory import SearchResult
from synapse.storage.roommember import ProfileInfo
from synapse.types import UserID
from synapse.util.metrics import Measure
from synapse.util.retryutils import NotRetryingDestination
from synapse.util.stringutils import non_null_str_or_none
if TYPE_CHECKING:
from synapse.server import HomeServer
@ -33,6 +40,25 @@ logger = logging.getLogger(__name__)
# then be coalesced such that only one /profile request is made).
USER_DIRECTORY_STALE_REFRESH_TIME_MS = 60 * 1000
# Maximum number of remote servers that we will attempt to refresh profiles for
# in one go. The number of servers already being refreshed is subtracted from
# this to decide how many more servers may be started.
MAX_SERVERS_TO_REFRESH_PROFILES_FOR_IN_ONE_GO = 5
# As long as we have servers to refresh (without backoff), keep adding more
# every 15 seconds. The value is in seconds: it is passed as the delay to
# `clock.call_later`.
INTERVAL_TO_ADD_MORE_SERVERS_TO_REFRESH_PROFILES = 15
def calculate_time_of_next_retry(now_ts: int, retry_count: int) -> int:
    """
    Calculates the time of a next retry given `now_ts` in ms and the number
    of failures encountered thus far.

    The delay is `5 ** retry_count` minutes, with `retry_count` clamped to
    the range 0..7. Currently the (approximate) sequence goes:
    1 min, 5 min, 25 min, 2 hour, 10 hour, 52 hour, 10 day, 7.75 week

    Args:
        now_ts: the current time, in milliseconds.
        retry_count: the number of failures encountered so far. Values below
            0 are treated as 0 and values above 7 are treated as 7.

    Returns:
        The time, in milliseconds, at which the next retry should be made.
    """
    # Clamp on both sides. Without the lower bound a negative count would turn
    # `5 ** n` into a float, breaking the `int` return type and producing a
    # delay shorter than the documented 1 minute minimum.
    exponent = max(0, min(retry_count, 7))
    return now_ts + 60_000 * (5**exponent)
class UserDirectoryHandler(StateDeltasHandler):
"""Handles queries and updates for the user_directory.
@ -69,12 +95,24 @@ class UserDirectoryHandler(StateDeltasHandler):
self.update_user_directory = hs.config.worker.should_update_user_directory
self.search_all_users = hs.config.userdirectory.user_directory_search_all_users
self.spam_checker = hs.get_spam_checker()
self._hs = hs
# The current position in the current_state_delta stream
self.pos: Optional[int] = None
# Guard to ensure we only process deltas one at a time
self._is_processing = False
# Guard to ensure we only have one process for refreshing remote profiles
self._is_refreshing_remote_profiles = False
# Handle to cancel the `call_later` of `kick_off_remote_profile_refresh_process`
self._refresh_remote_profiles_call_later: Optional[IDelayedCall] = None
# Guard to ensure we only have one process for refreshing remote profiles
# for the given servers.
# Set of server names.
self._is_refreshing_remote_profiles_for_servers: Set[str] = set()
if self.update_user_directory:
self.notifier.add_replication_callback(self.notify_new_event)
@ -82,6 +120,11 @@ class UserDirectoryHandler(StateDeltasHandler):
# we start populating the user directory
self.clock.call_later(0, self.notify_new_event)
# Kick off the profile refresh process on startup
self._refresh_remote_profiles_call_later = self.clock.call_later(
10, self.kick_off_remote_profile_refresh_process
)
async def search_users(
self, user_id: str, search_term: str, limit: int
) -> SearchResult:
@ -483,6 +526,20 @@ class UserDirectoryHandler(StateDeltasHandler):
next_try_at_ms=now_ts + USER_DIRECTORY_STALE_REFRESH_TIME_MS,
retry_counter=0,
)
# Schedule a wake-up to refresh the user directory for this server.
# We intentionally wake up this server directly because we don't want
# other servers ahead of it in the queue to get in the way of updating
# the profile if the server only just sent us an event.
self.clock.call_later(
USER_DIRECTORY_STALE_REFRESH_TIME_MS // 1000 + 1,
self.kick_off_remote_profile_refresh_process_for_remote_server,
UserID.from_string(user_id).domain,
)
# Schedule a wake-up to handle any backoffs that may occur in the future.
self.clock.call_later(
2 * USER_DIRECTORY_STALE_REFRESH_TIME_MS // 1000 + 1,
self.kick_off_remote_profile_refresh_process,
)
return
prev_name = prev_event.content.get("displayname")
@ -505,3 +562,188 @@ class UserDirectoryHandler(StateDeltasHandler):
# Only update if something has changed, or we didn't have a previous event
# in the first place.
await self.store.update_profile_in_user_dir(user_id, new_name, new_avatar)
def kick_off_remote_profile_refresh_process(self) -> None:
    """Start the background process that refreshes stale remote profiles.

    Called when there may be remote users with stale profiles to be
    refreshed. This is a no-op when this instance does not update the user
    directory, or when the refresh process is already running. Any delayed
    call stored in `_refresh_remote_profiles_call_later` is cancelled (if
    still active) and forgotten before the process starts.
    """
    if not self.update_user_directory or self._is_refreshing_remote_profiles:
        return

    # We are about to run now, so a previously scheduled wake-up is redundant.
    pending_call = self._refresh_remote_profiles_call_later
    if pending_call:
        if pending_call.active():
            pending_call.cancel()
        self._refresh_remote_profiles_call_later = None

    async def refresh_then_release() -> None:
        # Always drop the guard, even if the refresh raises, so that later
        # kick-offs are not blocked forever.
        try:
            await self._unsafe_refresh_remote_profiles()
        finally:
            self._is_refreshing_remote_profiles = False

    # Take the guard before handing over to the background process.
    self._is_refreshing_remote_profiles = True
    run_as_background_process(
        "user_directory.refresh_remote_profiles", refresh_then_release
    )
async def _unsafe_refresh_remote_profiles(self) -> None:
    """Start per-server refresh processes for servers with stale profiles.

    "Unsafe" because it does not take the `_is_refreshing_remote_profiles`
    guard itself; it is run via `kick_off_remote_profile_refresh_process`,
    which does.

    The number of servers started is limited so that no more than
    `MAX_SERVERS_TO_REFRESH_PROFILES_FOR_IN_ONE_GO` servers are being
    refreshed at once. Before returning, a follow-up call to
    `kick_off_remote_profile_refresh_process` is scheduled:

    * after `INTERVAL_TO_ADD_MORE_SERVERS_TO_REFRESH_PROFILES` seconds when
      the concurrency limit is already reached or when servers were started;
    * at the time the next backing-off user becomes refreshable when no
      server is refreshable right now.

    Nothing is scheduled when the store reports no stale users at all.
    """
    limit = MAX_SERVERS_TO_REFRESH_PROFILES_FOR_IN_ONE_GO - len(
        self._is_refreshing_remote_profiles_for_servers
    )
    if limit <= 0:
        # nothing to do: already refreshing the maximum number of servers
        # at once.
        # Come back later.
        self._refresh_remote_profiles_call_later = self.clock.call_later(
            INTERVAL_TO_ADD_MORE_SERVERS_TO_REFRESH_PROFILES,
            self.kick_off_remote_profile_refresh_process,
        )
        return

    servers_to_refresh = (
        await self.store.get_remote_servers_with_profiles_to_refresh(
            now_ts=self.clock.time_msec(), limit=limit
        )
    )

    if not servers_to_refresh:
        # Do we have any backing-off servers that we should try again
        # for eventually?
        # By setting `now` to a point in the far future, we can ask for
        # which server/user is next to be refreshed, even though it is
        # not actually refreshable *now*.
        end_of_time = 1 << 62
        backing_off_servers = (
            await self.store.get_remote_servers_with_profiles_to_refresh(
                now_ts=end_of_time, limit=1
            )
        )
        if backing_off_servers:
            # Find out when the next user is refreshable and schedule a
            # refresh then.
            backing_off_server_name = backing_off_servers[0]
            users = await self.store.get_remote_users_to_refresh_on_server(
                backing_off_server_name, now_ts=end_of_time, limit=1
            )
            if not users:
                return
            _, _, next_try_at_ts = users[0]
            # The user may have become refreshable while the queries above
            # were running, which would make the computed delay negative.
            # A negative delay is not valid for `call_later` (Twisted's
            # `callLater` asserts `delay >= 0`), so clamp it to "now".
            delay_secs = max(
                0, ((next_try_at_ts - self.clock.time_msec()) // 1000) + 2
            )
            self._refresh_remote_profiles_call_later = self.clock.call_later(
                delay_secs,
                self.kick_off_remote_profile_refresh_process,
            )

        return

    for server_to_refresh in servers_to_refresh:
        self.kick_off_remote_profile_refresh_process_for_remote_server(
            server_to_refresh
        )

    # There may be more servers than we were allowed to start this time
    # round: come back later to add more.
    self._refresh_remote_profiles_call_later = self.clock.call_later(
        INTERVAL_TO_ADD_MORE_SERVERS_TO_REFRESH_PROFILES,
        self.kick_off_remote_profile_refresh_process,
    )
def kick_off_remote_profile_refresh_process_for_remote_server(
    self, server_name: str
) -> None:
    """Start a background process refreshing stale profiles on one server.

    Called when there may be remote users with stale profiles to be
    refreshed on the given server. Does nothing when this instance does not
    update the user directory, or when a refresh for `server_name` is
    already in progress.

    Args:
        server_name: the remote server whose users should be refreshed.
    """
    if (
        not self.update_user_directory
        or server_name in self._is_refreshing_remote_profiles_for_servers
    ):
        return

    async def refresh_then_release() -> None:
        # Whatever happens, unregister the server afterwards so that it can
        # be refreshed again in the future.
        try:
            await self._unsafe_refresh_remote_profiles_for_remote_server(
                server_name
            )
        finally:
            self._is_refreshing_remote_profiles_for_servers.remove(server_name)

    # Register the server as in-progress before starting the process.
    self._is_refreshing_remote_profiles_for_servers.add(server_name)
    run_as_background_process(
        "user_directory.refresh_remote_profiles_for_remote_server",
        refresh_then_release,
    )
async def _unsafe_refresh_remote_profiles_for_remote_server(
    self, server_name: str
) -> None:
    """Refresh the stale profiles of users on `server_name`.

    "Unsafe" because it does not register the server in
    `_is_refreshing_remote_profiles_for_servers` itself; it is run via
    `kick_off_remote_profile_refresh_process_for_remote_server`, which does.

    Users are fetched from the store in batches of 10 until the store has
    none left that are refreshable now. For each user the profile is
    requested and then:

    * on success, the user directory entry is updated;
    * if the destination is backing off, the user is marked stale until the
      end of that backoff;
    * if the profile is not found, the stale marker is cleared;
    * on any other error, the user is marked stale until the time given by
      `calculate_time_of_next_retry`.

    Args:
        server_name: the remote server whose users should be refreshed.
    """
    logger.info("Refreshing profiles in user directory for %s", server_name)

    async def retry_later(user_id: str, previous_failures: int) -> None:
        # Record one more failure and push the next attempt back accordingly.
        await self.store.set_remote_user_profile_in_user_dir_stale(
            user_id,
            calculate_time_of_next_retry(
                self.clock.time_msec(), previous_failures + 1
            ),
            retry_counter=previous_failures + 1,
        )

    while True:
        # Get a handful of users to process.
        users_due = await self.store.get_remote_users_to_refresh_on_server(
            server_name, now_ts=self.clock.time_msec(), limit=10
        )
        if not users_due:
            # Finished for now
            return

        for user_id, retry_counter, _ in users_due:
            try:
                # Request the profile of the user.
                profile = await self._hs.get_profile_handler().get_profile(
                    user_id, ignore_backoff=False
                )
            except NotRetryingDestination as e:
                logger.info(
                    "Failed to refresh profile for %r because the destination is undergoing backoff",
                    user_id,
                )
                # As a special-case, we back off until the destination is no longer
                # backed off from.
                await self.store.set_remote_user_profile_in_user_dir_stale(
                    user_id,
                    e.retry_last_ts + e.retry_interval,
                    retry_counter=retry_counter + 1,
                )
            except SynapseError as e:
                if e.code == HTTPStatus.NOT_FOUND and e.errcode == Codes.NOT_FOUND:
                    # The profile doesn't exist.
                    # TODO Does this mean we should clear it from our user
                    #    directory?
                    await self.store.clear_remote_user_profile_in_user_dir_stale(
                        user_id
                    )
                    logger.warning(
                        "Refresh of remote profile %r: not found (%r)",
                        user_id,
                        e.msg,
                    )
                else:
                    logger.warning(
                        "Failed to refresh profile for %r because %r", user_id, e
                    )
                    await retry_later(user_id, retry_counter)
            except Exception:
                # Logs at ERROR level with the active exception attached.
                logger.exception(
                    "Failed to refresh profile for %r due to unhandled exception",
                    user_id,
                )
                await retry_later(user_id, retry_counter)
            else:
                # Only reached when the profile was fetched successfully.
                await self.store.update_profile_in_user_dir(
                    user_id,
                    display_name=non_null_str_or_none(profile.get("displayname")),
                    avatar_url=non_null_str_or_none(profile.get("avatar_url")),
                )