synapse-product/synapse/metrics/__init__.py

# -*- coding: utf-8 -*-
# Copyright 2015, 2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import functools
import time
import gc
import os
import platform
import attr

from prometheus_client import Gauge, Histogram, Counter
from prometheus_client.core import GaugeMetricFamily, REGISTRY

from twisted.internet import reactor


logger = logging.getLogger(__name__)

running_on_pypy = platform.python_implementation() == "PyPy"
all_metrics = []
all_collectors = []
all_gauges = {}


class RegistryProxy(object):

    def collect(self):
        for metric in REGISTRY.collect():
            if not metric.name.startswith("__"):
                yield metric


@attr.s(hash=True)
class LaterGauge(object):

    name = attr.ib()
    desc = attr.ib()
    labels = attr.ib(hash=False)
    caller = attr.ib()

    def collect(self):

        g = GaugeMetricFamily(self.name, self.desc, labels=self.labels)

        try:
            calls = self.caller()
        except Exception as e:
            print(e)
            logger.err()
            yield g

        if isinstance(calls, dict):
            for k, v in calls.items():
                g.add_metric(k, v)
        else:
            g.add_metric([], calls)

        yield g

    def __attrs_post_init__(self):
        self._register()

    def _register(self):
        if self.name in all_gauges.keys():
            logger.warning("%s already registered, reregistering" % (self.name,))
            REGISTRY.unregister(all_gauges.pop(self.name))

        REGISTRY.register(self)
        all_gauges[self.name] = self


#
# Detailed CPU metrics
#

class CPUMetrics(object):

    def __init__(self):
        ticks_per_sec = 100
        try:
            # Try and get the system config
            ticks_per_sec = os.sysconf('SC_CLK_TCK')
        except (ValueError, TypeError, AttributeError):
            pass

        self.ticks_per_sec = ticks_per_sec

    def collect(self):

        with open("/proc/self/stat") as s:
            line = s.read()
            raw_stats = line.split(") ", 1)[1].split(" ")

            user = GaugeMetricFamily("process_cpu_user_seconds_total", "")
            user.add_metric([], float(raw_stats[11]) / self.ticks_per_sec)
            yield user

            sys = GaugeMetricFamily("process_cpu_system_seconds_total", "")
            sys.add_metric([], float(raw_stats[12]) / self.ticks_per_sec)
            yield sys


REGISTRY.register(CPUMetrics())

#
# Python GC metrics
#

gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"])
gc_time = Histogram(
    "python_gc_time",
    "Time taken to GC (sec)",
    ["gen"],
    buckets=[0.0025, 0.005, 0.01, 0.025, 0.05, 0.10, 0.25, 0.50, 1.00, 2.50,
             5.00, 7.50, 15.00, 30.00, 45.00, 60.00],
)


class GCCounts(object):

    def collect(self):
        cm = GaugeMetricFamily("python_gc_counts", "GC cycle counts", labels=["gen"])
        for n, m in enumerate(gc.get_count()):
            cm.add_metric([str(n)], m)

        yield cm


REGISTRY.register(GCCounts())

#
# Twisted reactor metrics
#

tick_time = Histogram(
    "python_twisted_reactor_tick_time",
    "Tick time of the Twisted reactor (sec)",
    buckets=[0.001, 0.002, 0.005, 0.001, 0.005, 0.01. 0.025, 0.05, 0.1, 0.2,
             0.5, 1, 2, 5],
)
pending_calls_metric = Histogram(
    "python_twisted_reactor_pending_calls",
    "Pending calls",
    buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000],
)

#
# Federation Metrics
#

sent_edus_counter = Counter("synapse_federation_client_sent_edus", "")

sent_transactions_counter = Counter("synapse_federation_client_sent_transactions", "")

events_processed_counter = Counter("synapse_federation_client_events_processed", "")

# Used to track where various components have processed in the event stream,
# e.g. federation sending, appservice sending, etc.
event_processing_positions = Gauge("synapse_event_processing_positions", "", ["name"])

# Used to track the current max events stream position
event_persisted_position = Gauge("synapse_event_persisted_position", "")

# Used to track the received_ts of the last event processed by various
# components
event_processing_last_ts = Gauge("synapse_event_processing_last_ts", "", ["name"])

# Used to track the lag processing events. This is the time difference
# between the last processed event's received_ts and the time it was
# finished being processed.
event_processing_lag = Gauge("synapse_event_processing_lag", "", ["name"])


def runUntilCurrentTimer(func):

    @functools.wraps(func)
    def f(*args, **kwargs):
        now = reactor.seconds()
        num_pending = 0

        # _newTimedCalls is one long list of *all* pending calls. Below loop
        # is based off of impl of reactor.runUntilCurrent
        for delayed_call in reactor._newTimedCalls:
            if delayed_call.time > now:
                break

            if delayed_call.delayed_time > 0:
                continue

            num_pending += 1

        num_pending += len(reactor.threadCallQueue)
        start = time.time()
        ret = func(*args, **kwargs)
        end = time.time()

        # record the amount of wallclock time spent running pending calls.
        # This is a proxy for the actual amount of time between reactor polls,
        # since about 25% of time is actually spent running things triggered by
        # I/O events, but that is harder to capture without rewriting half the
        # reactor.
        tick_time.observe(end - start)
        pending_calls_metric.observe(num_pending)

        if running_on_pypy:
            return ret

        # Check if we need to do a manual GC (since its been disabled), and do
        # one if necessary.
        threshold = gc.get_threshold()
        counts = gc.get_count()
        for i in (2, 1, 0):
            if threshold[i] < counts[i]:
                logger.info("Collecting gc %d", i)

                start = time.time()
                unreachable = gc.collect(i)
                end = time.time()

                gc_time.labels(i).observe(end - start)
                gc_unreachable.labels(i).set(unreachable)

        return ret

    return f


try:
    # Ensure the reactor has all the attributes we expect
    reactor.runUntilCurrent
    reactor._newTimedCalls
    reactor.threadCallQueue

    # runUntilCurrent is called when we have pending calls. It is called once
    # per iteratation after fd polling.
    reactor.runUntilCurrent = runUntilCurrentTimer(reactor.runUntilCurrent)

    # We manually run the GC each reactor tick so that we can get some metrics
    # about time spent doing GC,
    if not running_on_pypy:
        gc.disable()
except AttributeError:
    pass
An initial implementation of a 'metrics' instance, similar to a 'logger' for keeping counter stats on method calls 2015-02-24 13:03:39 -05:00			`# -- coding: utf-8 --`
copyrights 2016-01-06 23:26:29 -05:00			`# Copyright 2015, 2016 OpenMarket Ltd`
An initial implementation of a 'metrics' instance, similar to a 'logger' for keeping counter stats on method calls 2015-02-24 13:03:39 -05:00			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

Ensure that exceptions while rendering individual metrics don't stop others from being rendered anyway - especially useful for CallbackMetric 2015-03-04 12:13:09 -05:00			`import logging`
Add some metrics about the reactor 2015-08-13 06:38:59 -04:00			`import functools`
			`import time`
Manually run GC on reactor tick. This also adds a metric for amount of time spent in GC. 2016-05-09 05:13:25 -04:00			`import gc`
add back CPU metrics 2018-05-23 14:03:56 -04:00			`import os`
Don't disable GC when running on PyPy PyPy's incminimark GC can't be triggered manually. From what I observed there are no obvious issues with just letting it run normally. And unlike CPython, it actually returns unused RAM to the system. Signed-off-by: Vincent Breitmoser <look@my.amazin.horse> 2018-01-07 19:53:32 -05:00			`import platform`
replacing portions 2018-05-21 20:47:37 -04:00			`import attr`
Add some metrics about the reactor 2015-08-13 06:38:59 -04:00
replacing portions 2018-05-21 20:47:37 -04:00			`from prometheus_client import Gauge, Histogram, Counter`
more cleanup 2018-05-22 18:36:20 -04:00			`from prometheus_client.core import GaugeMetricFamily, REGISTRY`
Ensure that exceptions while rendering individual metrics don't stop others from being rendered anyway - especially useful for CallbackMetric 2015-03-04 12:13:09 -05:00
replacing portions 2018-05-21 20:47:37 -04:00			`from twisted.internet import reactor`
An initial implementation of a 'metrics' instance, similar to a 'logger' for keeping counter stats on method calls 2015-02-24 13:03:39 -05:00

Ensure that exceptions while rendering individual metrics don't stop others from being rendered anyway - especially useful for CallbackMetric 2015-03-04 12:13:09 -05:00			`logger = logging.getLogger(__name__)`

cleanup 2018-05-22 18:32:57 -04:00			`running_on_pypy = platform.python_implementation() == "PyPy"`
Change CacheMetrics to be quicker We change it so that each cache has an individual CacheMetric, instead of having one global CacheMetric. This means that when a cache tries to increment a counter it does not need to go through so many indirections. 2016-06-02 06:29:44 -04:00			`all_metrics = []`
A slightly neater way to manage metric collector functions 2016-10-19 12:54:09 -04:00			`all_collectors = []`
replacing portions 2018-05-21 20:47:37 -04:00			`all_gauges = {}`
An initial implementation of a 'metrics' instance, similar to a 'logger' for keeping counter stats on method calls 2015-02-24 13:03:39 -05:00
fixes 2018-05-22 17:28:23 -04:00
			`class RegistryProxy(object):`

			`def collect(self):`
			`for metric in REGISTRY.collect():`
			`if not metric.name.startswith("__"):`
			`yield metric`


replacing portions 2018-05-21 20:47:37 -04:00			`@attr.s(hash=True)`
			`class LaterGauge(object):`
An initial implementation of a 'metrics' instance, similar to a 'logger' for keeping counter stats on method calls 2015-02-24 13:03:39 -05:00
replacing portions 2018-05-21 20:47:37 -04:00			`name = attr.ib()`
			`desc = attr.ib()`
			`labels = attr.ib(hash=False)`
			`caller = attr.ib()`
Add GaugeMetric 2018-04-11 06:07:33 -04:00
replacing portions 2018-05-21 20:47:37 -04:00			`def collect(self):`
Create the concept of a cachecounter metric; generating two counters specific to caches 2015-03-04 10:47:23 -05:00
fixes 2018-05-22 17:28:23 -04:00			`g = GaugeMetricFamily(self.name, self.desc, labels=self.labels)`
Create the concept of a cachecounter metric; generating two counters specific to caches 2015-03-04 10:47:23 -05:00
replacing portions 2018-05-21 20:47:37 -04:00			`try:`
			`calls = self.caller()`
			`except Exception as e:`
			`print(e)`
			`logger.err()`
			`yield g`
An initial implementation of a 'metrics' instance, similar to a 'logger' for keeping counter stats on method calls 2015-02-24 13:03:39 -05:00
replacing portions 2018-05-21 20:47:37 -04:00			`if isinstance(calls, dict):`
			`for k, v in calls.items():`
			`g.add_metric(k, v)`
			`else:`
			`g.add_metric([], calls)`
Add metrics for psutil derived memory usage 2016-07-20 10:47:28 -04:00
replacing portions 2018-05-21 20:47:37 -04:00			`yield g`
Add metrics for psutil derived memory usage 2016-07-20 10:47:28 -04:00
Merge remote-tracking branch 'origin/develop' into 3218-official-prom 2018-05-22 11:50:26 -04:00			`def __attrs_post_init__(self):`
			`self._register()`
Add metrics for psutil derived memory usage 2016-07-20 10:47:28 -04:00
Merge remote-tracking branch 'origin/develop' into 3218-official-prom 2018-05-22 11:50:26 -04:00			`def _register(self):`
replacing portions 2018-05-21 20:47:37 -04:00			`if self.name in all_gauges.keys():`
Merge remote-tracking branch 'origin/develop' into 3218-official-prom 2018-05-22 11:50:26 -04:00			`logger.warning("%s already registered, reregistering" % (self.name,))`
replacing portions 2018-05-21 20:47:37 -04:00			`REGISTRY.unregister(all_gauges.pop(self.name))`
Use _ instead of . as a metric namespacing separator, for Prometheus 2015-03-06 11:18:21 -05:00
replacing portions 2018-05-21 20:47:37 -04:00			`REGISTRY.register(self)`
			`all_gauges[self.name] = self`
An initial implementation of a 'metrics' instance, similar to a 'logger' for keeping counter stats on method calls 2015-02-24 13:03:39 -05:00

add back CPU metrics 2018-05-23 14:03:56 -04:00			`#`
			`# Detailed CPU metrics`
			`#`

			`class CPUMetrics(object):`

			`def __init__(self):`
			`ticks_per_sec = 100`
			`try:`
			`# Try and get the system config`
			`ticks_per_sec = os.sysconf('SC_CLK_TCK')`
			`except (ValueError, TypeError, AttributeError):`
			`pass`

			`self.ticks_per_sec = ticks_per_sec`

			`def collect(self):`

			`with open("/proc/self/stat") as s:`
			`line = s.read()`
			`raw_stats = line.split(") ", 1)[1].split(" ")`

			`user = GaugeMetricFamily("process_cpu_user_seconds_total", "")`
			`user.add_metric([], float(raw_stats[11]) / self.ticks_per_sec)`
			`yield user`

			`sys = GaugeMetricFamily("process_cpu_system_seconds_total", "")`
			`sys.add_metric([], float(raw_stats[12]) / self.ticks_per_sec)`
			`yield sys`

pepeightttt 2018-05-23 14:08:59 -04:00
add back CPU metrics 2018-05-23 14:03:56 -04:00			`REGISTRY.register(CPUMetrics())`

replacing portions 2018-05-21 20:47:37 -04:00			`#`
			`# Python GC metrics`
			`#`
An initial implementation of a 'metrics' instance, similar to a 'logger' for keeping counter stats on method calls 2015-02-24 13:03:39 -05:00
replacing portions 2018-05-21 20:47:37 -04:00			`gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"])`
cleanup 2018-05-22 18:32:57 -04:00			`gc_time = Histogram(`
			`"python_gc_time",`
update metrics to be in seconds 2018-05-28 05:10:27 -04:00			`"Time taken to GC (sec)",`
cleanup 2018-05-22 18:32:57 -04:00			`["gen"],`
update metrics to be in seconds 2018-05-28 05:10:27 -04:00			`buckets=[0.0025, 0.005, 0.01, 0.025, 0.05, 0.10, 0.25, 0.50, 1.00, 2.50,`
			`5.00, 7.50, 15.00, 30.00, 45.00, 60.00],`
cleanup 2018-05-22 18:32:57 -04:00			`)`

Provide some process resource usage metrics 2015-03-05 11:15:21 -05:00
replacing portions 2018-05-21 20:47:37 -04:00			`class GCCounts(object):`
cleanup 2018-05-22 18:32:57 -04:00
replacing portions 2018-05-21 20:47:37 -04:00			`def collect(self):`
			`cm = GaugeMetricFamily("python_gc_counts", "GC cycle counts", labels=["gen"])`
			`for n, m in enumerate(gc.get_count()):`
			`cm.add_metric([str(n)], m)`
Remember to emit final linefeed from /metrics page, or Prometheus gets upset 2015-03-06 14:08:47 -05:00
replacing portions 2018-05-21 20:47:37 -04:00			`yield cm`
Provide some process resource usage metrics 2015-03-05 11:15:21 -05:00
cleanup 2018-05-22 18:32:57 -04:00
replacing portions 2018-05-21 20:47:37 -04:00			`REGISTRY.register(GCCounts())`
Provide some process resource usage metrics 2015-03-05 11:15:21 -05:00
replacing portions 2018-05-21 20:47:37 -04:00			`#`
			`# Twisted reactor metrics`
			`#`
Rename the python-specific metrics now the docs claim that we have done 2016-11-03 13:03:52 -04:00
cleanup 2018-05-22 18:32:57 -04:00			`tick_time = Histogram(`
			`"python_twisted_reactor_tick_time",`
update metrics to be in seconds 2018-05-28 05:10:27 -04:00			`"Tick time of the Twisted reactor (sec)",`
			`buckets=[0.001, 0.002, 0.005, 0.001, 0.005, 0.01. 0.025, 0.05, 0.1, 0.2,`
			`0.5, 1, 2, 5],`
cleanup 2018-05-22 18:32:57 -04:00			`)`
			`pending_calls_metric = Histogram(`
			`"python_twisted_reactor_pending_calls",`
			`"Pending calls",`
			`buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000],`
			`)`
Add some metrics about the reactor 2015-08-13 06:38:59 -04:00
replacing portions 2018-05-21 20:47:37 -04:00			`#`
			`# Federation Metrics`
			`#`
Add some metrics about the reactor 2015-08-13 06:38:59 -04:00
replacing portions 2018-05-21 20:47:37 -04:00			`sent_edus_counter = Counter("synapse_federation_client_sent_edus", "")`
Record some more GC metrics 2016-06-07 11:51:01 -04:00
replacing portions 2018-05-21 20:47:37 -04:00			`sent_transactions_counter = Counter("synapse_federation_client_sent_transactions", "")`
Set up the process collector during metrics __init__; that way all split-process workers have it 2016-10-27 13:09:34 -04:00
replacing portions 2018-05-21 20:47:37 -04:00			`events_processed_counter = Counter("synapse_federation_client_events_processed", "")`
Track where event stream processing have gotten up to 2018-04-11 06:07:51 -04:00
			`# Used to track where various components have processed in the event stream,`
			`# e.g. federation sending, appservice sending, etc.`
replacing portions 2018-05-21 20:47:37 -04:00			`event_processing_positions = Gauge("synapse_event_processing_positions", "", ["name"])`
Track where event stream processing have gotten up to 2018-04-11 06:07:51 -04:00
			`# Used to track the current max events stream position`
replacing portions 2018-05-21 20:47:37 -04:00			`event_persisted_position = Gauge("synapse_event_persisted_position", "")`
Track where event stream processing have gotten up to 2018-04-11 06:07:51 -04:00
Track last processed event received_ts 2018-04-11 06:52:19 -04:00			`# Used to track the received_ts of the last event processed by various`
			`# components`
replacing portions 2018-05-21 20:47:37 -04:00			`event_processing_last_ts = Gauge("synapse_event_processing_last_ts", "", ["name"])`
Track last processed event received_ts 2018-04-11 06:52:19 -04:00
			`# Used to track the lag processing events. This is the time difference`
			`# between the last processed event's received_ts and the time it was`
			`# finished being processed.`
replacing portions 2018-05-21 20:47:37 -04:00			`event_processing_lag = Gauge("synapse_event_processing_lag", "", ["name"])`
Add some metrics about the reactor 2015-08-13 06:38:59 -04:00
cleanup 2018-05-22 18:32:57 -04:00
Add some metrics about the reactor 2015-08-13 06:38:59 -04:00			`def runUntilCurrentTimer(func):`

			`@functools.wraps(func)`
			`def f(args, *kwargs):`
Fix pending_calls metric to not lie 2015-08-14 10:42:52 -04:00			`now = reactor.seconds()`
			`num_pending = 0`

			`# _newTimedCalls is one long list of all pending calls. Below loop`
			`# is based off of impl of reactor.runUntilCurrent`
Use more helpful variable names 2015-08-18 06:47:00 -04:00			`for delayed_call in reactor._newTimedCalls:`
			`if delayed_call.time > now:`
Fix pending_calls metric to not lie 2015-08-14 10:42:52 -04:00			`break`

Use more helpful variable names 2015-08-18 06:47:00 -04:00			`if delayed_call.delayed_time > 0:`
Fix pending_calls metric to not lie 2015-08-14 10:42:52 -04:00			`continue`

			`num_pending += 1`

			`num_pending += len(reactor.threadCallQueue)`
update metrics to be in seconds 2018-05-28 05:10:27 -04:00			`start = time.time()`
Add some metrics about the reactor 2015-08-13 06:38:59 -04:00			`ret = func(args, *kwargs)`
update metrics to be in seconds 2018-05-28 05:10:27 -04:00			`end = time.time()`
Add some comments about the reactor tick time metric 2018-01-19 18:51:04 -05:00
			`# record the amount of wallclock time spent running pending calls.`
			`# This is a proxy for the actual amount of time between reactor polls,`
			`# since about 25% of time is actually spent running things triggered by`
			`# I/O events, but that is harder to capture without rewriting half the`
			`# reactor.`
replacing portions 2018-05-21 20:47:37 -04:00			`tick_time.observe(end - start)`
			`pending_calls_metric.observe(num_pending)`
Manually run GC on reactor tick. This also adds a metric for amount of time spent in GC. 2016-05-09 05:13:25 -04:00
Don't disable GC when running on PyPy PyPy's incminimark GC can't be triggered manually. From what I observed there are no obvious issues with just letting it run normally. And unlike CPython, it actually returns unused RAM to the system. Signed-off-by: Vincent Breitmoser <look@my.amazin.horse> 2018-01-07 19:53:32 -05:00			`if running_on_pypy:`
			`return ret`

Add a comment 2016-05-13 11:31:08 -04:00			`# Check if we need to do a manual GC (since its been disabled), and do`
			`# one if necessary.`
Manually run GC on reactor tick. This also adds a metric for amount of time spent in GC. 2016-05-09 05:13:25 -04:00			`threshold = gc.get_threshold()`
			`counts = gc.get_count()`
Also record number of unreachable objects 2016-06-07 08:40:22 -04:00			`for i in (2, 1, 0):`
Manually run GC on reactor tick. This also adds a metric for amount of time spent in GC. 2016-05-09 05:13:25 -04:00			`if threshold[i] < counts[i]:`
			`logger.info("Collecting gc %d", i)`
Count number of GC collects 2016-05-16 04:32:29 -04:00
update metrics to be in seconds 2018-05-28 05:10:27 -04:00			`start = time.time()`
Also record number of unreachable objects 2016-06-07 08:40:22 -04:00			`unreachable = gc.collect(i)`
update metrics to be in seconds 2018-05-28 05:10:27 -04:00			`end = time.time()`
Count number of GC collects 2016-05-16 04:32:29 -04:00
replacing portions 2018-05-21 20:47:37 -04:00			`gc_time.labels(i).observe(end - start)`
			`gc_unreachable.labels(i).set(unreachable)`
Manually run GC on reactor tick. This also adds a metric for amount of time spent in GC. 2016-05-09 05:13:25 -04:00
Add some metrics about the reactor 2015-08-13 06:38:59 -04:00			`return ret`

			`return f`


Also check for presence of 'threadCallQueue' in reactor 2015-08-18 06:51:08 -04:00			`try:`
			`# Ensure the reactor has all the attributes we expect`
			`reactor.runUntilCurrent`
			`reactor._newTimedCalls`
			`reactor.threadCallQueue`

Add some metrics about the reactor 2015-08-13 06:38:59 -04:00			`# runUntilCurrent is called when we have pending calls. It is called once`
			`# per iteratation after fd polling.`
			`reactor.runUntilCurrent = runUntilCurrentTimer(reactor.runUntilCurrent)`
Add a comment 2016-05-13 11:31:08 -04:00
			`# We manually run the GC each reactor tick so that we can get some metrics`
			`# about time spent doing GC,`
Don't disable GC when running on PyPy PyPy's incminimark GC can't be triggered manually. From what I observed there are no obvious issues with just letting it run normally. And unlike CPython, it actually returns unused RAM to the system. Signed-off-by: Vincent Breitmoser <look@my.amazin.horse> 2018-01-07 19:53:32 -05:00			`if not running_on_pypy:`
			`gc.disable()`
Also check for presence of 'threadCallQueue' in reactor 2015-08-18 06:51:08 -04:00			`except AttributeError:`
			`pass`