From d162aecaac52fb467822e319e4c3c5b216c33ca9 Mon Sep 17 00:00:00 2001
From: David Robertson
Date: Wed, 7 Jun 2023 18:12:23 +0100
Subject: [PATCH] Quick & dirty metric for background update status (#15740)

* Quick & dirty metric for background update status

* Changelog

* Remove debug

Co-authored-by: Mathieu Velten

* Actually write to _aborted

---------

Co-authored-by: Mathieu Velten
---
 changelog.d/15740.feature             |  1 +
 synapse/metrics/__init__.py           |  2 ++
 synapse/storage/background_updates.py | 30 +++++++++++++++++++++++++++
 synapse/storage/database.py           |  8 ++++++-
 4 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 changelog.d/15740.feature

diff --git a/changelog.d/15740.feature b/changelog.d/15740.feature
new file mode 100644
index 000000000..fed342ea5
--- /dev/null
+++ b/changelog.d/15740.feature
@@ -0,0 +1 @@
+Expose a metric reporting the database background update status.
diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py
index 8ce588722..39fc62993 100644
--- a/synapse/metrics/__init__.py
+++ b/synapse/metrics/__init__.py
@@ -77,6 +77,8 @@ RegistryProxy = cast(CollectorRegistry, _RegistryProxy)

 @attr.s(slots=True, hash=True, auto_attribs=True)
 class LaterGauge(Collector):
+    """A Gauge which periodically calls a user-provided callback to produce metrics."""
+
     name: str
     desc: str
     labels: Optional[Sequence[str]] = attr.ib(hash=False)
diff --git a/synapse/storage/background_updates.py b/synapse/storage/background_updates.py
index ca085ef80..edc97a9d6 100644
--- a/synapse/storage/background_updates.py
+++ b/synapse/storage/background_updates.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+from enum import IntEnum
 from types import TracebackType
 from typing import (
     TYPE_CHECKING,
@@ -136,6 +137,15 @@ class BackgroundUpdatePerformance:
         return float(self.total_item_count) / float(self.total_duration_ms)


+class UpdaterStatus(IntEnum):
+    # Use negative values for error conditions.
+    ABORTED = -1
+    DISABLED = 0
+    NOT_STARTED = 1
+    RUNNING_UPDATE = 2
+    COMPLETE = 3
+
+
 class BackgroundUpdater:
     """Background updates are updates to the database that run in the
     background. Each update processes a batch of data at once. We attempt to
@@ -158,11 +168,16 @@ class BackgroundUpdater:

         self._background_update_performance: Dict[str, BackgroundUpdatePerformance] = {}
         self._background_update_handlers: Dict[str, _BackgroundUpdateHandler] = {}
+        # TODO: all these bool flags make me feel icky---can we combine into a status
+        # enum?
         self._all_done = False

         # Whether we're currently running updates
         self._running = False

+        # Marker to be set if we abort and halt all background updates.
+        self._aborted = False
+
         # Whether background updates are enabled. This allows us to
         # enable/disable background updates via the admin API.
         self.enabled = True
@@ -175,6 +190,20 @@ class BackgroundUpdater:
         self.sleep_duration_ms = hs.config.background_updates.sleep_duration_ms
         self.sleep_enabled = hs.config.background_updates.sleep_enabled

+    def get_status(self) -> UpdaterStatus:
+        """An integer summarising the updater status. Used as a metric."""
+        if self._aborted:
+            return UpdaterStatus.ABORTED
+        # TODO: a status for "have seen at least one failure, but haven't aborted yet".
+        if not self.enabled:
+            return UpdaterStatus.DISABLED
+
+        if self._all_done:
+            return UpdaterStatus.COMPLETE
+        if self._running:
+            return UpdaterStatus.RUNNING_UPDATE
+        return UpdaterStatus.NOT_STARTED
+
     def register_update_controller_callbacks(
         self,
         on_update: ON_UPDATE_CALLBACK,
@@ -296,6 +325,7 @@ class BackgroundUpdater:
                 except Exception:
                     back_to_back_failures += 1
                     if back_to_back_failures >= 5:
+                        self._aborted = True
                         raise RuntimeError(
                             "5 back-to-back background update failures; aborting."
                         )
diff --git a/synapse/storage/database.py b/synapse/storage/database.py
index bdaa508db..10fa6c480 100644
--- a/synapse/storage/database.py
+++ b/synapse/storage/database.py
@@ -54,7 +54,7 @@ from synapse.logging.context import (
     current_context,
     make_deferred_yieldable,
 )
-from synapse.metrics import register_threadpool
+from synapse.metrics import LaterGauge, register_threadpool
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.storage.background_updates import BackgroundUpdater
 from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine
@@ -547,6 +547,12 @@ class DatabasePool:
         self._db_pool = make_pool(hs.get_reactor(), database_config, engine)

         self.updates = BackgroundUpdater(hs, self)
+        LaterGauge(
+            "synapse_background_update_status",
+            "Background update status",
+            [],
+            self.updates.get_status,
+        )

         self._previous_txn_total_time = 0.0
         self._current_txn_total_time = 0.0