From d64653d062a7fc27782e70c1ca581e85b7730e72 Mon Sep 17 00:00:00 2001
From: Eric Eastwood
Date: Thu, 18 Aug 2022 10:05:07 -0500
Subject: [PATCH] Track number of hosts affected by the rate limiter (#13541)

Track number of hosts affected by the rate limiter so we can differentiate one really noisy homeserver from a general ratelimit tuning problem across the federation.

Follow-up to https://github.com/matrix-org/synapse/pull/13534

Part of https://github.com/matrix-org/synapse/issues/13356
---
 changelog.d/13541.misc         |  1 +
 synapse/util/ratelimitutils.py | 43 ++++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)
 create mode 100644 changelog.d/13541.misc

diff --git a/changelog.d/13541.misc b/changelog.d/13541.misc
new file mode 100644
index 000000000..b488bf74c
--- /dev/null
+++ b/changelog.d/13541.misc
@@ -0,0 +1 @@
+Add metrics to track how the rate limiter is affecting requests (sleep/reject).
diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py
index 434b02b97..724d39b92 100644
--- a/synapse/util/ratelimitutils.py
+++ b/synapse/util/ratelimitutils.py
@@ -30,7 +30,7 @@ from synapse.logging.context import (
     run_in_background,
 )
 from synapse.logging.opentracing import start_active_span
-from synapse.metrics import Histogram
+from synapse.metrics import Histogram, LaterGauge
 from synapse.util import Clock
 
 if typing.TYPE_CHECKING:
@@ -74,6 +74,27 @@ class FederationRateLimiter:
             str, "_PerHostRatelimiter"
         ] = collections.defaultdict(new_limiter)
 
+        # We track the number of affected hosts per time-period so we can
+        # differentiate one really noisy homeserver from a general
+        # ratelimit tuning problem across the federation.
+        LaterGauge(
+            "synapse_rate_limit_sleep_affected_hosts",
+            "Number of hosts that had requests put to sleep",
+            [],
+            lambda: sum(
+                ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values()
+            ),
+        )
+        LaterGauge(
+            "synapse_rate_limit_reject_affected_hosts",
+            "Number of hosts that had requests rejected",
+            [],
+            lambda: sum(
+                ratelimiter.should_reject()
+                for ratelimiter in self.ratelimiters.values()
+            ),
+        )
+
     def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]":
         """Used to ratelimit an incoming request from a given host
 
@@ -139,6 +160,21 @@ class _PerHostRatelimiter:
         finally:
             self._on_exit(request_id)
 
+    def should_reject(self) -> bool:
+        """
+        Whether to reject the request if we already have too many queued up
+        (either sleeping or in the ready queue).
+        """
+        queue_size = len(self.ready_request_queue) + len(self.sleeping_requests)
+        return queue_size > self.reject_limit
+
+    def should_sleep(self) -> bool:
+        """
+        Whether to sleep the request if we already have too many requests coming
+        through within the window.
+        """
+        return len(self.request_times) > self.sleep_limit
+
     def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
         time_now = self.clock.time_msec()
 
@@ -149,8 +185,7 @@ class _PerHostRatelimiter:
 
         # reject the request if we already have too many queued up (either
         # sleeping or in the ready queue).
-        queue_size = len(self.ready_request_queue) + len(self.sleeping_requests)
-        if queue_size > self.reject_limit:
+        if self.should_reject():
             logger.debug("Ratelimiter(%s): rejecting request", self.host)
             rate_limit_reject_counter.inc()
             raise LimitExceededError(
@@ -180,7 +215,7 @@ class _PerHostRatelimiter:
                 len(self.request_times),
             )
 
-        if len(self.request_times) > self.sleep_limit:
+        if self.should_sleep():
             logger.debug(
                 "Ratelimiter(%s) [%s]: sleeping request for %f sec",
                 self.host,