From 8ded8ba2c755f254bd98e15db7bc865eed997f07 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 1 Mar 2018 11:20:34 +0000 Subject: [PATCH 1/4] Make repl send_event idempotent and retry on timeouts If we treated timeouts as failures on the worker we would attempt to clean up e.g. push actions while the master might still process the event. --- synapse/replication/http/send_event.py | 44 ++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/synapse/replication/http/send_event.py b/synapse/replication/http/send_event.py index 439bfbb4f..73cd3d91d 100644 --- a/synapse/replication/http/send_event.py +++ b/synapse/replication/http/send_event.py @@ -15,10 +15,15 @@ from twisted.internet import defer -from synapse.api.errors import SynapseError, MatrixCodeMessageException +from synapse.api.errors import ( + SynapseError, MatrixCodeMessageException, CodeMessageException, +) from synapse.events import FrozenEvent from synapse.events.snapshot import EventContext from synapse.http.servlet import RestServlet, parse_json_object_from_request +from synapse.util.async import sleep +from synapse.util.caches.response_cache import ResponseCache +from synapse.util.logcontext import make_deferred_yieldable, preserve_fn from synapse.util.metrics import Measure from synapse.types import Requester @@ -43,7 +48,9 @@ def send_event_to_master(client, host, port, requester, event, context, ratelimit (bool) extra_users (list(str)): Any extra users to notify about event """ - uri = "http://%s:%s/_synapse/replication/send_event" % (host, port,) + uri = "http://%s:%s/_synapse/replication/send_event/%s" % ( + host, port, event.event_id, + ) payload = { "event": event.get_pdu_json(), @@ -56,7 +63,20 @@ def send_event_to_master(client, host, port, requester, event, context, } try: - result = yield client.post_json_get_json(uri, payload) + # We keep retrying the same request for timeouts. This is so that we + # have a good idea that the request has either succeeded or failed on + # the master, and so whether we should clean up or not. + while True: + try: + result = yield client.put_json(uri, payload) + break + except CodeMessageException as e: + if e.code != 504: + raise + + # If we timed out we probably don't need to worry about backing + # off too much, but lets just wait a little anyway. + yield sleep(1) except MatrixCodeMessageException as e: # We convert to SynapseError as we know that it was a SynapseError # on the master process that we should send to the client. (And @@ -71,7 +91,7 @@ class ReplicationSendEventRestServlet(RestServlet): The API looks like: - POST /_synapse/replication/send_event + POST /_synapse/replication/send_event/:event_id { "event": { .. serialized event .. }, @@ -83,7 +103,7 @@ class ReplicationSendEventRestServlet(RestServlet): "extra_users": [], } """ - PATTERNS = [re.compile("^/_synapse/replication/send_event$")] + PATTERNS = [re.compile("^/_synapse/replication/send_event/(?P[^/]+)$")] def __init__(self, hs): super(ReplicationSendEventRestServlet, self).__init__() @@ -92,8 +112,20 @@ class ReplicationSendEventRestServlet(RestServlet): self.store = hs.get_datastore() self.clock = hs.get_clock() + # The responses are tiny, so we may as well cache them for a while + self.response_cache = ResponseCache(hs, timeout_ms=30 * 60 * 1000) + + def on_PUT(self, request, event_id): + result = self.response_cache.get(event_id) + if not result: + result = self.response_cache.set( + event_id, + preserve_fn(self._handle_request)(request) + ) + return make_deferred_yieldable(result) + @defer.inlineCallbacks - def on_POST(self, request): + def _handle_request(self, request): with Measure(self.clock, "repl_send_event_parse"): content = parse_json_object_from_request(request) From 89f90d808ad923374a204377b0bc8ed15a9385a9 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 1 Mar 2018 11:59:16 +0000 Subject: [PATCH 2/4] Add some logging --- synapse/replication/http/send_event.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/synapse/replication/http/send_event.py b/synapse/replication/http/send_event.py index 73cd3d91d..f490622b6 100644 --- a/synapse/replication/http/send_event.py +++ b/synapse/replication/http/send_event.py @@ -72,6 +72,7 @@ def send_event_to_master(client, host, port, requester, event, context, break except CodeMessageException as e: if e.code != 504: + logger.warn("send_event request timed out") raise # If we timed out we probably don't need to worry about backing @@ -118,6 +119,7 @@ class ReplicationSendEventRestServlet(RestServlet): def on_PUT(self, request, event_id): result = self.response_cache.get(event_id) if not result: + logger.warn("Returning cached response") result = self.response_cache.set( event_id, preserve_fn(self._handle_request)(request) From 157298f9862bd06e9268aed9cdffbc1c912fd74d Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 1 Mar 2018 11:59:45 +0000 Subject: [PATCH 3/4] Don't do preserve_fn for every request --- synapse/replication/http/send_event.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/synapse/replication/http/send_event.py b/synapse/replication/http/send_event.py index f490622b6..665a56d2e 100644 --- a/synapse/replication/http/send_event.py +++ b/synapse/replication/http/send_event.py @@ -122,10 +122,11 @@ class ReplicationSendEventRestServlet(RestServlet): logger.warn("Returning cached response") result = self.response_cache.set( event_id, - preserve_fn(self._handle_request)(request) + self._handle_request(request) ) return make_deferred_yieldable(result) + @preserve_fn @defer.inlineCallbacks def _handle_request(self, request): with Measure(self.clock, "repl_send_event_parse"): From 126b9bf96f2c989e7402b5e0177ee39a2a20940e Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 1 Mar 2018 12:05:33 +0000 Subject: [PATCH 4/4] Log in the correct places --- synapse/replication/http/send_event.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/synapse/replication/http/send_event.py b/synapse/replication/http/send_event.py index 665a56d2e..70f2fe456 100644 --- a/synapse/replication/http/send_event.py +++ b/synapse/replication/http/send_event.py @@ -72,9 +72,10 @@ def send_event_to_master(client, host, port, requester, event, context, break except CodeMessageException as e: if e.code != 504: - logger.warn("send_event request timed out") raise + logger.warn("send_event request timed out") + # If we timed out we probably don't need to worry about backing # off too much, but lets just wait a little anyway. yield sleep(1) @@ -119,11 +120,12 @@ class ReplicationSendEventRestServlet(RestServlet): def on_PUT(self, request, event_id): result = self.response_cache.get(event_id) if not result: - logger.warn("Returning cached response") result = self.response_cache.set( event_id, self._handle_request(request) ) + else: + logger.warn("Returning cached response") return make_deferred_yieldable(result) @preserve_fn