From 68128d56269e51021797a979306a08ca4018b207 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 19 Jun 2019 20:49:04 +0100 Subject: [PATCH 1/6] Remove unused _get_event_counters This has been redundant since cdb3757942fefdcdc3d33b9c6d7c9e44decefd6f. --- synapse/storage/_base.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py index 941c07fce..75cffb842 100644 --- a/synapse/storage/_base.py +++ b/synapse/storage/_base.py @@ -213,7 +213,6 @@ class SQLBaseStore(object): # is running in mainline, and we have some nice monitoring frontends # to watch it self._txn_perf_counters = PerformanceCounters() - self._get_event_counters = PerformanceCounters() self._get_event_cache = Cache( "*getEvent*", keylen=3, max_entries=hs.config.event_cache_size @@ -369,15 +368,10 @@ class SQLBaseStore(object): time_now - time_then, limit=3 ) - top_3_event_counters = self._get_event_counters.interval( - time_now - time_then, limit=3 - ) - perf_logger.info( - "Total database time: %.3f%% {%s} {%s}", + "Total database time: %.3f%% {%s}", ratio * 100, top_three_counters, - top_3_event_counters, ) self._clock.looping_call(loop, 10000) From f682af052dd16864d9ac1484cededf5e4c33ad02 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 19 Jun 2019 20:58:36 +0100 Subject: [PATCH 2/6] Simplify PerformanceCounters.update interface we already have the duration for the update, so may as well use it rather than passing extra params around and recalculating it. --- synapse/storage/_base.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py index 75cffb842..910f6ee9d 100644 --- a/synapse/storage/_base.py +++ b/synapse/storage/_base.py @@ -167,22 +167,22 @@ class PerformanceCounters(object): self.current_counters = {} self.previous_counters = {} - def update(self, key, start_time, end_time=None): - if end_time is None: - end_time = time.time() - duration = end_time - start_time + def update(self, key, duration_secs): count, cum_time = self.current_counters.get(key, (0, 0)) count += 1 - cum_time += duration + cum_time += duration_secs self.current_counters[key] = (count, cum_time) - return end_time - def interval(self, interval_duration, limit=3): + def interval(self, interval_duration_secs, limit=3): counters = [] for name, (count, cum_time) in iteritems(self.current_counters): prev_count, prev_time = self.previous_counters.get(name, (0, 0)) counters.append( - ((cum_time - prev_time) / interval_duration, count - prev_count, name) + ( + (cum_time - prev_time) / interval_duration_secs, + count - prev_count, + name, + ) ) self.previous_counters = dict(self.current_counters) @@ -362,10 +362,11 @@ class SQLBaseStore(object): time_then = self._previous_loop_ts self._previous_loop_ts = time_now - ratio = (curr - prev) / (time_now - time_then) + duration = time_now - time_then + ratio = (curr - prev) / duration top_three_counters = self._txn_perf_counters.interval( - time_now - time_then, limit=3 + duration, limit=3 ) perf_logger.info( @@ -453,7 +454,7 @@ class SQLBaseStore(object): transaction_logger.debug("[TXN END] {%s} %f sec", name, duration) self._current_txn_total_time += duration - self._txn_perf_counters.update(desc, start, end) + self._txn_perf_counters.update(desc, duration) sql_txn_timer.labels(desc).observe(duration) @defer.inlineCallbacks From fe641df770ae38a70ce5c17b3112cf57ec93f06d Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 19 Jun 2019 20:55:53 +0100 Subject: [PATCH 3/6] Sanity-checking for metrics updates Check that our clocks go forward. --- synapse/util/logcontext.py | 40 +++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/synapse/util/logcontext.py b/synapse/util/logcontext.py index fe412355d..645d71282 100644 --- a/synapse/util/logcontext.py +++ b/synapse/util/logcontext.py @@ -325,10 +325,9 @@ class LoggingContext(object): ) return - usage_end = get_thread_resource_usage() - - self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime - self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime + utime_delta, stime_delta = self._get_cputime() + self._resource_usage.ru_utime += utime_delta + self._resource_usage.ru_stime += stime_delta self.usage_start = None @@ -346,13 +345,38 @@ class LoggingContext(object): # can include resource usage so far. is_main_thread = threading.current_thread() is self.main_thread if self.alive and self.usage_start and is_main_thread: - current = get_thread_resource_usage() - res.ru_utime += current.ru_utime - self.usage_start.ru_utime - res.ru_stime += current.ru_stime - self.usage_start.ru_stime + utime_delta, stime_delta = self._get_cputime() + res.ru_utime += utime_delta + res.ru_stime += stime_delta return res + def _get_cputime(self): + """Get the cpu usage time so far + + Returns: Tuple[float, float]: seconds in user mode, seconds in system mode + """ + current = get_thread_resource_usage() + + utime_delta = current.ru_utime - self.usage_start.ru_utime + stime_delta = current.ru_stime - self.usage_start.ru_stime + + # sanity check + if utime_delta < 0: + raise ValueError("utime went backwards! %f < %f" % ( + current.ru_utime, self.usage_start.ru_utime, + )) + + if stime_delta < 0: + raise ValueError("stime went backwards! %f < %f" % ( + current.ru_stime, self.usage_start.ru_stime, + )) + + return utime_delta, stime_delta + def add_database_transaction(self, duration_sec): + if duration_sec < 0: + raise ValueError('DB txn time can only be non-negative') self._resource_usage.db_txn_count += 1 self._resource_usage.db_txn_duration_sec += duration_sec @@ -363,6 +387,8 @@ class LoggingContext(object): sched_sec (float): number of seconds it took us to get a connection """ + if sched_sec < 0: + raise ValueError('DB scheduling time can only be non-negative') self._resource_usage.db_sched_duration_sec += sched_sec def record_event_fetch(self, event_count): From ae4d97bf05b8fc5eca2dd12ce169213545a40c5f Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Wed, 19 Jun 2019 21:15:56 +0100 Subject: [PATCH 4/6] changelog --- changelog.d/5499.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/5499.misc diff --git a/changelog.d/5499.misc b/changelog.d/5499.misc new file mode 100644 index 000000000..84de1f2da --- /dev/null +++ b/changelog.d/5499.misc @@ -0,0 +1 @@ +Some cleanups and sanity-checking in the CPU and database metrics. \ No newline at end of file From dc94773e600fef29cf88d304ffb1515b145aea13 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 24 Jun 2019 10:01:16 +0100 Subject: [PATCH 5/6] Avoid raising exceptions in metrics Sentry will catch the errors if they happen, so that should be good enough, and woun't make things explode if we hit the error condition. --- synapse/util/logcontext.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/synapse/util/logcontext.py b/synapse/util/logcontext.py index 10022ff62..6b0d2deea 100644 --- a/synapse/util/logcontext.py +++ b/synapse/util/logcontext.py @@ -374,20 +374,26 @@ class LoggingContext(object): # sanity check if utime_delta < 0: - raise ValueError("utime went backwards! %f < %f" % ( - current.ru_utime, self.usage_start.ru_utime, - )) + logger.error( + "utime went backwards! %f < %f", + current.ru_utime, + self.usage_start.ru_utime, + ) + utime_delta = 0 if stime_delta < 0: - raise ValueError("stime went backwards! %f < %f" % ( - current.ru_stime, self.usage_start.ru_stime, - )) + logger.error( + "stime went backwards! %f < %f", + current.ru_stime, + self.usage_start.ru_stime, + ) + stime_delta = 0 return utime_delta, stime_delta def add_database_transaction(self, duration_sec): if duration_sec < 0: - raise ValueError('DB txn time can only be non-negative') + raise ValueError("DB txn time can only be non-negative") self._resource_usage.db_txn_count += 1 self._resource_usage.db_txn_duration_sec += duration_sec @@ -399,7 +405,7 @@ class LoggingContext(object): connection """ if sched_sec < 0: - raise ValueError('DB scheduling time can only be non-negative') + raise ValueError("DB scheduling time can only be non-negative") self._resource_usage.db_sched_duration_sec += sched_sec def record_event_fetch(self, event_count): From 1793de6c6da6fc18fc5425dcf1818d8b007ec890 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Mon, 24 Jun 2019 11:16:13 +0100 Subject: [PATCH 6/6] black --- synapse/storage/_base.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py index cbd6568c4..b862daab2 100644 --- a/synapse/storage/_base.py +++ b/synapse/storage/_base.py @@ -363,14 +363,10 @@ class SQLBaseStore(object): duration = time_now - time_then ratio = (curr - prev) / duration - top_three_counters = self._txn_perf_counters.interval( - duration, limit=3 - ) + top_three_counters = self._txn_perf_counters.interval(duration, limit=3) perf_logger.info( - "Total database time: %.3f%% {%s}", - ratio * 100, - top_three_counters, + "Total database time: %.3f%% {%s}", ratio * 100, top_three_counters ) self._clock.looping_call(loop, 10000)