Fix stack overflow in _PerHostRatelimiter due to synchronous requests (#14812)

When there are many synchronous requests waiting on a
`_PerHostRatelimiter`, each request will be started recursively just
after the previous request has completed. Under the right conditions,
this leads to stack exhaustion.

A common way for requests to become synchronous is when the remote
client disconnects early, because the homeserver is overloaded and slow
to respond.

Avoid stack exhaustion under these conditions by deferring subsequent
requests until the next reactor tick.

Fixes #14480.

Signed-off-by: Sean Quah <seanq@matrix.org>
This commit is contained in:
Sean Quah 2023-01-13 00:16:21 +00:00 committed by GitHub
parent 12083d37a8
commit 772e8c2385
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 70 additions and 12 deletions

View file

@ -13,6 +13,7 @@
# limitations under the License.
from typing import Optional
from twisted.internet import defer
from twisted.internet.defer import Deferred
from synapse.config.homeserver import HomeServerConfig
@ -29,7 +30,7 @@ class FederationRateLimiterTestCase(TestCase):
"""A simple test with the default values"""
reactor, clock = get_clock()
rc_config = build_rc_config()
ratelimiter = FederationRateLimiter(clock, rc_config)
ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
with ratelimiter.ratelimit("testhost") as d1:
# shouldn't block
@ -39,7 +40,7 @@ class FederationRateLimiterTestCase(TestCase):
"""Test what happens when we hit the concurrent limit"""
reactor, clock = get_clock()
rc_config = build_rc_config({"rc_federation": {"concurrent": 2}})
ratelimiter = FederationRateLimiter(clock, rc_config)
ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
with ratelimiter.ratelimit("testhost") as d1:
# shouldn't block
@ -57,6 +58,7 @@ class FederationRateLimiterTestCase(TestCase):
# ... until we complete an earlier request
cm2.__exit__(None, None, None)
reactor.advance(0.0)
self.successResultOf(d3)
def test_sleep_limit(self) -> None:
@ -65,7 +67,7 @@ class FederationRateLimiterTestCase(TestCase):
rc_config = build_rc_config(
{"rc_federation": {"sleep_limit": 2, "sleep_delay": 500}}
)
ratelimiter = FederationRateLimiter(clock, rc_config)
ratelimiter = FederationRateLimiter(reactor, clock, rc_config)
with ratelimiter.ratelimit("testhost") as d1:
# shouldn't block
@ -81,6 +83,43 @@ class FederationRateLimiterTestCase(TestCase):
sleep_time = _await_resolution(reactor, d3)
self.assertAlmostEqual(sleep_time, 500, places=3)
def test_lots_of_queued_things(self) -> None:
    """Queue many synchronous requests behind one in-flight request.

    Releasing the in-flight request must *not* blow the stack while the
    queued requests are resumed.
    """
    reactor, clock = get_clock()
    federation_limits = {
        "sleep_limit": 1000000000,  # never sleep
        "reject_limit": 1000000000,  # never reject requests
        "concurrent": 1,
    }
    rc_config = build_rc_config({"rc_federation": federation_limits})
    ratelimiter = FederationRateLimiter(reactor, clock, rc_config)

    async def queued_request() -> None:
        # Each queued request simply waits for its turn and then finishes.
        with ratelimiter.ratelimit("testhost") as queued_d:
            await queued_d

    with ratelimiter.ratelimit("testhost") as first_d:
        # With "concurrent" set to 1, the first request shouldn't block.
        self.successResultOf(first_d)

        # Start 99 requests while the first one is still in flight ...
        for _ in range(99):
            defer.ensureDeferred(queued_request())

        # ... plus a final one whose completion is checked below.
        final_request = defer.ensureDeferred(queued_request())

    # Leaving the context manager lets the queued requests resume.
    # If a stack overflow occurs, the final request will not complete.

    # Advance the reactor so that all the queued requests can complete.
    reactor.advance(0.0)
    self.successResultOf(final_request)
def _await_resolution(reactor: ThreadedMemoryReactorClock, d: Deferred) -> float:
"""advance the clock until the deferred completes.