diff --git a/changelog.d/17499.bugfix b/changelog.d/17499.bugfix new file mode 100644 index 000000000..5cb7b3c30 --- /dev/null +++ b/changelog.d/17499.bugfix @@ -0,0 +1 @@ +Fix a bug introduced in v1.110.0 which caused `/keys/query` to return incomplete results, leading to high network activity and CPU usage on Matrix clients. diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py index 668cec513..f78e66ad0 100644 --- a/synapse/handlers/e2e_keys.py +++ b/synapse/handlers/e2e_keys.py @@ -291,13 +291,20 @@ class E2eKeysHandler: # Only try and fetch keys for destinations that are not marked as # down. - filtered_destinations = await filter_destinations_by_retry_limiter( - remote_queries_not_in_cache.keys(), - self.clock, - self.store, - # Let's give an arbitrary grace period for those hosts that are - # only recently down - retry_due_within_ms=60 * 1000, + unfiltered_destinations = remote_queries_not_in_cache.keys() + filtered_destinations = set( + await filter_destinations_by_retry_limiter( + unfiltered_destinations, + self.clock, + self.store, + # Let's give an arbitrary grace period for those hosts that are + # only recently down + retry_due_within_ms=60 * 1000, + ) + ) + failures.update( + (dest, _NOT_READY_FOR_RETRY_FAILURE) + for dest in (unfiltered_destinations - filtered_destinations) ) await concurrently_execute( @@ -1641,6 +1648,9 @@ def _check_device_signature( raise SynapseError(400, "Invalid signature", Codes.INVALID_SIGNATURE) +_NOT_READY_FOR_RETRY_FAILURE = {"status": 503, "message": "Not ready for retry"} + + def _exception_to_failure(e: Exception) -> JsonDict: if isinstance(e, SynapseError): return {"status": e.code, "errcode": e.errcode, "message": str(e)} @@ -1649,7 +1659,7 @@ def _exception_to_failure(e: Exception) -> JsonDict: return {"status": e.code, "message": str(e)} if isinstance(e, NotRetryingDestination): - return {"status": 503, "message": "Not ready for retry"} + return _NOT_READY_FOR_RETRY_FAILURE # include ConnectionRefused and other errors # diff --git a/tests/handlers/test_e2e_keys.py b/tests/handlers/test_e2e_keys.py index 0e6352ff4..8a3dfdcf7 100644 --- a/tests/handlers/test_e2e_keys.py +++ b/tests/handlers/test_e2e_keys.py @@ -43,9 +43,7 @@ from tests.unittest import override_config class E2eKeysHandlerTestCase(unittest.HomeserverTestCase): def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer: self.appservice_api = mock.AsyncMock() - return self.setup_test_homeserver( - federation_client=mock.Mock(), application_service_api=self.appservice_api - ) + return self.setup_test_homeserver(application_service_api=self.appservice_api) def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.handler = hs.get_e2e_keys_handler() @@ -1224,6 +1222,61 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase): }, ) + def test_query_devices_remote_down(self) -> None: + """Tests that querying keys for a remote user on an unreachable server returns + results in the "failures" property + """ + + remote_user_id = "@test:other" + local_user_id = "@test:test" + + # The backoff code treats time zero as special + self.reactor.advance(5) + + self.hs.get_federation_http_client().agent.request = mock.AsyncMock( # type: ignore[method-assign] + side_effect=Exception("boop") + ) + + e2e_handler = self.hs.get_e2e_keys_handler() + + query_result = self.get_success( + e2e_handler.query_devices( + { + "device_keys": {remote_user_id: []}, + }, + timeout=10, + from_user_id=local_user_id, + from_device_id="some_device_id", + ) + ) + + self.assertEqual( + query_result["failures"], + { + "other": { + "message": "Failed to send request: Exception: boop", + "status": 503, + } + }, + ) + + # Do it again: we should hit the backoff + query_result = self.get_success( + e2e_handler.query_devices( + { + "device_keys": {remote_user_id: []}, + }, + timeout=10, + from_user_id=local_user_id, + from_device_id="some_device_id", + ) + ) + + self.assertEqual( + query_result["failures"], + {"other": {"message": "Not ready for retry", "status": 503}}, + ) + @parameterized.expand( [ # The remote homeserver's response indicates that this user has 0/1/2 devices.