From 18660a34d82ccb120efd2fa2480b42ac62dbe2b4 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> Date: Mon, 9 Dec 2019 11:55:30 +0000 Subject: [PATCH] Fix inaccurate per-block metrics (#6491) `Measure` incorrectly assumed that it was the only thing being done by the parent `LoggingContext`. For instance, during a "renew group attestations" operation, hundreds of `outbound_request` calls could take place in parallel, all using the same `LoggingContext`. This would mean that any resources used during *any* of those calls would be reported against *all* of them, producing wildly inaccurate results. Instead, we now give each `Measure` block its own `LoggingContext` (using the parent `LoggingContext` mechanism to ensure that the log lines look correct and that the metrics are ultimately propogated to the top level for reporting against requests/backgrond tasks). --- changelog.d/6491.bugfix | 1 + synapse/util/metrics.py | 60 +++++++++++++---------------------------- 2 files changed, 19 insertions(+), 42 deletions(-) create mode 100644 changelog.d/6491.bugfix diff --git a/changelog.d/6491.bugfix b/changelog.d/6491.bugfix new file mode 100644 index 000000000..78204693b --- /dev/null +++ b/changelog.d/6491.bugfix @@ -0,0 +1 @@ +Fix inaccurate per-block Prometheus metrics. diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py index 63ddaaba8..7b1845546 100644 --- a/synapse/util/metrics.py +++ b/synapse/util/metrics.py @@ -91,72 +91,48 @@ class Measure(object): __slots__ = [ "clock", "name", - "start_context", + "_logging_context", "start", - "created_context", - "start_usage", ] def __init__(self, clock, name): self.clock = clock self.name = name - self.start_context = None + self._logging_context = None self.start = None - self.created_context = False def __enter__(self): + if self._logging_context: + raise RuntimeError("Measure() objects cannot be re-used") + self.start = self.clock.time() - self.start_context = LoggingContext.current_context() - if not self.start_context: - self.start_context = LoggingContext("Measure") - self.start_context.__enter__() - self.created_context = True - - self.start_usage = self.start_context.get_resource_usage() - + parent_context = LoggingContext.current_context() + self._logging_context = LoggingContext( + "Measure[%s]" % (self.name,), parent_context + ) + self._logging_context.__enter__() in_flight.register((self.name,), self._update_in_flight) def __exit__(self, exc_type, exc_val, exc_tb): - if isinstance(exc_type, Exception) or not self.start_context: - return - - in_flight.unregister((self.name,), self._update_in_flight) + if not self._logging_context: + raise RuntimeError("Measure() block exited without being entered") duration = self.clock.time() - self.start + usage = self._logging_context.get_resource_usage() - block_counter.labels(self.name).inc() - block_timer.labels(self.name).inc(duration) + in_flight.unregister((self.name,), self._update_in_flight) + self._logging_context.__exit__(exc_type, exc_val, exc_tb) - context = LoggingContext.current_context() - - if context != self.start_context: - logger.warning( - "Context has unexpectedly changed from '%s' to '%s'. (%r)", - self.start_context, - context, - self.name, - ) - return - - if not context: - logger.warning("Expected context. (%r)", self.name) - return - - current = context.get_resource_usage() - usage = current - self.start_usage try: + block_counter.labels(self.name).inc() + block_timer.labels(self.name).inc(duration) block_ru_utime.labels(self.name).inc(usage.ru_utime) block_ru_stime.labels(self.name).inc(usage.ru_stime) block_db_txn_count.labels(self.name).inc(usage.db_txn_count) block_db_txn_duration.labels(self.name).inc(usage.db_txn_duration_sec) block_db_sched_duration.labels(self.name).inc(usage.db_sched_duration_sec) except ValueError: - logger.warning( - "Failed to save metrics! OLD: %r, NEW: %r", self.start_usage, current - ) - - if self.created_context: - self.start_context.__exit__(exc_type, exc_val, exc_tb) + logger.warning("Failed to save metrics! Usage: %s", usage) def _update_in_flight(self, metrics): """Gets called when processing in flight metrics