Merge pull request #5499 from matrix-org/rav/cleanup_metrics

Cleanups and sanity-checking in cpu and db metrics
Richard van der Hoff 2019-06-24 17:12:54 +01:00 committed by GitHub
commit e59a8cd2e5
3 changed files with 53 additions and 29 deletions

changelog.d/5499.misc Normal file

@@ -0,0 +1 @@
+Some cleanups and sanity-checking in the CPU and database metrics.

View File

@@ -175,22 +175,22 @@ class PerformanceCounters(object):
         self.current_counters = {}
         self.previous_counters = {}

-    def update(self, key, start_time, end_time=None):
-        if end_time is None:
-            end_time = time.time()
-        duration = end_time - start_time
+    def update(self, key, duration_secs):
         count, cum_time = self.current_counters.get(key, (0, 0))
         count += 1
-        cum_time += duration
+        cum_time += duration_secs
         self.current_counters[key] = (count, cum_time)
-        return end_time

-    def interval(self, interval_duration, limit=3):
+    def interval(self, interval_duration_secs, limit=3):
         counters = []
         for name, (count, cum_time) in iteritems(self.current_counters):
             prev_count, prev_time = self.previous_counters.get(name, (0, 0))
             counters.append(
-                ((cum_time - prev_time) / interval_duration, count - prev_count, name)
+                (
+                    (cum_time - prev_time) / interval_duration_secs,
+                    count - prev_count,
+                    name,
+                )
             )

         self.previous_counters = dict(self.current_counters)
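
For context, the refactored counter interface moves time-keeping out to the caller: update() now takes a pre-measured duration in seconds, and interval() takes the length of the reporting window. A minimal usage sketch, assuming only the signatures shown in this hunk (the timing helper and reporting function below are illustrative, not part of this change):

import time

counters = PerformanceCounters()

def timed(desc, func, *args):
    # The caller owns the clock: measure the duration and hand it to update().
    start = time.monotonic()
    try:
        return func(*args)
    finally:
        counters.update(desc, time.monotonic() - start)

_last_report = time.monotonic()

def report_busiest():
    # interval() is given the window length so each entry can be expressed as
    # seconds of work per second of wall clock over that window.
    global _last_report
    now = time.monotonic()
    busiest = counters.interval(now - _last_report, limit=3)
    _last_report = now
    return busiest
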
@@ -221,7 +221,6 @@ class SQLBaseStore(object):
         # is running in mainline, and we have some nice monitoring frontends
         # to watch it
         self._txn_perf_counters = PerformanceCounters()
-        self._get_event_counters = PerformanceCounters()

         self._get_event_cache = Cache(
             "*getEvent*", keylen=3, max_entries=hs.config.event_cache_size
@@ -369,21 +368,13 @@ class SQLBaseStore(object):
             time_then = self._previous_loop_ts
             self._previous_loop_ts = time_now

-            ratio = (curr - prev) / (time_now - time_then)
+            duration = time_now - time_then
+            ratio = (curr - prev) / duration

-            top_three_counters = self._txn_perf_counters.interval(
-                time_now - time_then, limit=3
-            )
-
-            top_3_event_counters = self._get_event_counters.interval(
-                time_now - time_then, limit=3
-            )
+            top_three_counters = self._txn_perf_counters.interval(duration, limit=3)

             perf_logger.info(
-                "Total database time: %.3f%% {%s} {%s}",
-                ratio * 100,
-                top_three_counters,
-                top_3_event_counters,
+                "Total database time: %.3f%% {%s}", ratio * 100, top_three_counters
             )

         self._clock.looping_call(loop, 10000)
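
The reporting loop above reduces to a rate over a fixed window (the looping_call fires every 10 000 ms). A small worked example of the same arithmetic, with made-up numbers:

# Illustrative numbers only.
prev, curr = 120.0, 123.5              # cumulative DB txn seconds at the previous tick / now
time_then, time_now = 1000.0, 1010.0   # ten seconds apart

duration = time_now - time_then        # 10.0 seconds of wall clock
ratio = (curr - prev) / duration       # 0.35, i.e. 35% of the window spent in DB transactions

# Logged as: "Total database time: 35.000% {...}"
print("Total database time: %.3f%%" % (ratio * 100,))
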
@@ -465,7 +456,7 @@ class SQLBaseStore(object):
             transaction_logger.debug("[TXN END] {%s} %f sec", name, duration)

             self._current_txn_total_time += duration
-            self._txn_perf_counters.update(desc, start, end)
+            self._txn_perf_counters.update(desc, duration)
             sql_txn_timer.labels(desc).observe(duration)

     @defer.inlineCallbacks

View File

@@ -336,10 +336,9 @@ class LoggingContext(object):
             logger.warning("Called stop on logcontext %s without calling start", self)
             return

-        usage_end = get_thread_resource_usage()
-
-        self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
-        self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
+        utime_delta, stime_delta = self._get_cputime()
+        self._resource_usage.ru_utime += utime_delta
+        self._resource_usage.ru_stime += stime_delta

         self.usage_start = None
@@ -357,13 +356,44 @@ class LoggingContext(object):
         # can include resource usage so far.
         is_main_thread = threading.current_thread() is self.main_thread
         if self.alive and self.usage_start and is_main_thread:
-            current = get_thread_resource_usage()
-            res.ru_utime += current.ru_utime - self.usage_start.ru_utime
-            res.ru_stime += current.ru_stime - self.usage_start.ru_stime
+            utime_delta, stime_delta = self._get_cputime()
+            res.ru_utime += utime_delta
+            res.ru_stime += stime_delta

         return res

+    def _get_cputime(self):
+        """Get the cpu usage time so far
+
+        Returns: Tuple[float, float]: seconds in user mode, seconds in system mode
+        """
+        current = get_thread_resource_usage()
+
+        utime_delta = current.ru_utime - self.usage_start.ru_utime
+        stime_delta = current.ru_stime - self.usage_start.ru_stime
+
+        # sanity check
+        if utime_delta < 0:
+            logger.error(
+                "utime went backwards! %f < %f",
+                current.ru_utime,
+                self.usage_start.ru_utime,
+            )
+            utime_delta = 0
+
+        if stime_delta < 0:
+            logger.error(
+                "stime went backwards! %f < %f",
+                current.ru_stime,
+                self.usage_start.ru_stime,
+            )
+            stime_delta = 0
+
+        return utime_delta, stime_delta
+
     def add_database_transaction(self, duration_sec):
+        if duration_sec < 0:
+            raise ValueError("DB txn time can only be non-negative")
         self._resource_usage.db_txn_count += 1
         self._resource_usage.db_txn_duration_sec += duration_sec
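
The new ValueError guards turn negative durations, which would otherwise silently skew the accumulated metrics, into immediate failures. A hedged sketch of the contract callers now have to meet, using a monotonic clock so the measured duration cannot go backwards (the wrapper below is illustrative, not part of this diff):

import time

def run_db_txn(logcontext, txn_func):
    # Measure with a monotonic clock; a negative duration would now raise
    # ValueError in add_database_transaction() rather than corrupting totals.
    start = time.monotonic()
    try:
        return txn_func()
    finally:
        logcontext.add_database_transaction(time.monotonic() - start)
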
@@ -374,6 +404,8 @@ class LoggingContext(object):
             sched_sec (float): number of seconds it took us to get a
                 connection
         """
+        if sched_sec < 0:
+            raise ValueError("DB scheduling time can only be non-negative")
         self._resource_usage.db_sched_duration_sec += sched_sec

     def record_event_fetch(self, event_count):