Start fewer opentracing spans (#8640)

#8567 started a span for every background process. This is good as it means all Synapse code that gets run should be in a span (unless in the sentinel logging context), but it means we generate about 15x the number of spans as we did previously.

This PR attempts to reduce that number by a) not starting one for send commands to Redis, and b) deferring starting background processes until after we're sure they're necessary.

I don't really know how much this will help.
This commit is contained in:
Erik Johnston 2020-10-26 09:30:19 +00:00 committed by GitHub
parent 34a5696f93
commit 2b7c180879
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 96 additions and 53 deletions

1
changelog.d/8640.misc Normal file
View File

@ -0,0 +1 @@
Reduce number of OpenTracing spans started.

View File

@ -14,7 +14,7 @@
# limitations under the License. # limitations under the License.
import logging import logging
from typing import Dict, List, Optional from typing import Dict, List, Optional, Union
from prometheus_client import Counter from prometheus_client import Counter
@ -30,7 +30,10 @@ from synapse.metrics import (
event_processing_loop_counter, event_processing_loop_counter,
event_processing_loop_room_count, event_processing_loop_room_count,
) )
from synapse.metrics.background_process_metrics import run_as_background_process from synapse.metrics.background_process_metrics import (
run_as_background_process,
wrap_as_background_process,
)
from synapse.types import Collection, JsonDict, RoomStreamToken, UserID from synapse.types import Collection, JsonDict, RoomStreamToken, UserID
from synapse.util.metrics import Measure from synapse.util.metrics import Measure
@ -53,7 +56,7 @@ class ApplicationServicesHandler:
self.current_max = 0 self.current_max = 0
self.is_processing = False self.is_processing = False
async def notify_interested_services(self, max_token: RoomStreamToken): def notify_interested_services(self, max_token: RoomStreamToken):
"""Notifies (pushes) all application services interested in this event. """Notifies (pushes) all application services interested in this event.
Pushing is done asynchronously, so this method won't block for any Pushing is done asynchronously, so this method won't block for any
@ -72,6 +75,12 @@ class ApplicationServicesHandler:
if self.is_processing: if self.is_processing:
return return
# We only start a new background process if necessary rather than
# optimistically (to cut down on overhead).
self._notify_interested_services(max_token)
@wrap_as_background_process("notify_interested_services")
async def _notify_interested_services(self, max_token: RoomStreamToken):
with Measure(self.clock, "notify_interested_services"): with Measure(self.clock, "notify_interested_services"):
self.is_processing = True self.is_processing = True
try: try:
@ -166,8 +175,11 @@ class ApplicationServicesHandler:
finally: finally:
self.is_processing = False self.is_processing = False
async def notify_interested_services_ephemeral( def notify_interested_services_ephemeral(
self, stream_key: str, new_token: Optional[int], users: Collection[UserID] = [], self,
stream_key: str,
new_token: Optional[int],
users: Collection[Union[str, UserID]] = [],
): ):
"""This is called by the notifier in the background """This is called by the notifier in the background
when a ephemeral event handled by the homeserver. when a ephemeral event handled by the homeserver.
@ -183,13 +195,34 @@ class ApplicationServicesHandler:
new_token: The latest stream token new_token: The latest stream token
users: The user(s) involved with the event. users: The user(s) involved with the event.
""" """
if not self.notify_appservices:
return
if stream_key not in ("typing_key", "receipt_key", "presence_key"):
return
services = [ services = [
service service
for service in self.store.get_app_services() for service in self.store.get_app_services()
if service.supports_ephemeral if service.supports_ephemeral
] ]
if not services or not self.notify_appservices: if not services:
return return
# We only start a new background process if necessary rather than
# optimistically (to cut down on overhead).
self._notify_interested_services_ephemeral(
services, stream_key, new_token, users
)
@wrap_as_background_process("notify_interested_services_ephemeral")
async def _notify_interested_services_ephemeral(
self,
services: List[ApplicationService],
stream_key: str,
new_token: Optional[int],
users: Collection[Union[str, UserID]],
):
logger.info("Checking interested services for %s" % (stream_key)) logger.info("Checking interested services for %s" % (stream_key))
with Measure(self.clock, "notify_interested_services_ephemeral"): with Measure(self.clock, "notify_interested_services_ephemeral"):
for service in services: for service in services:
@ -237,7 +270,7 @@ class ApplicationServicesHandler:
return receipts return receipts
async def _handle_presence( async def _handle_presence(
self, service: ApplicationService, users: Collection[UserID] self, service: ApplicationService, users: Collection[Union[str, UserID]]
): ):
events = [] # type: List[JsonDict] events = [] # type: List[JsonDict]
presence_source = self.event_sources.sources["presence"] presence_source = self.event_sources.sources["presence"]
@ -245,6 +278,9 @@ class ApplicationServicesHandler:
service, "presence" service, "presence"
) )
for user in users: for user in users:
if isinstance(user, str):
user = UserID.from_string(user)
interested = await service.is_interested_in_presence(user, self.store) interested = await service.is_interested_in_presence(user, self.store)
if not interested: if not interested:
continue continue

View File

@ -317,7 +317,7 @@ def ensure_active_span(message, ret=None):
@contextlib.contextmanager @contextlib.contextmanager
def _noop_context_manager(*args, **kwargs): def noop_context_manager(*args, **kwargs):
"""Does exactly what it says on the tin""" """Does exactly what it says on the tin"""
yield yield
@ -413,7 +413,7 @@ def start_active_span(
""" """
if opentracing is None: if opentracing is None:
return _noop_context_manager() return noop_context_manager()
return opentracing.tracer.start_active_span( return opentracing.tracer.start_active_span(
operation_name, operation_name,
@ -428,7 +428,7 @@ def start_active_span(
def start_active_span_follows_from(operation_name, contexts): def start_active_span_follows_from(operation_name, contexts):
if opentracing is None: if opentracing is None:
return _noop_context_manager() return noop_context_manager()
references = [opentracing.follows_from(context) for context in contexts] references = [opentracing.follows_from(context) for context in contexts]
scope = start_active_span(operation_name, references=references) scope = start_active_span(operation_name, references=references)
@ -459,7 +459,7 @@ def start_active_span_from_request(
# Also, twisted uses byte arrays while opentracing expects strings. # Also, twisted uses byte arrays while opentracing expects strings.
if opentracing is None: if opentracing is None:
return _noop_context_manager() return noop_context_manager()
header_dict = { header_dict = {
k.decode(): v[0].decode() for k, v in request.requestHeaders.getAllRawHeaders() k.decode(): v[0].decode() for k, v in request.requestHeaders.getAllRawHeaders()
@ -497,7 +497,7 @@ def start_active_span_from_edu(
""" """
if opentracing is None: if opentracing is None:
return _noop_context_manager() return noop_context_manager()
carrier = json_decoder.decode(edu_content.get("context", "{}")).get( carrier = json_decoder.decode(edu_content.get("context", "{}")).get(
"opentracing", {} "opentracing", {}

View File

@ -24,7 +24,7 @@ from prometheus_client.core import REGISTRY, Counter, Gauge
from twisted.internet import defer from twisted.internet import defer
from synapse.logging.context import LoggingContext, PreserveLoggingContext from synapse.logging.context import LoggingContext, PreserveLoggingContext
from synapse.logging.opentracing import start_active_span from synapse.logging.opentracing import noop_context_manager, start_active_span
if TYPE_CHECKING: if TYPE_CHECKING:
import resource import resource
@ -167,7 +167,7 @@ class _BackgroundProcess:
) )
def run_as_background_process(desc: str, func, *args, **kwargs): def run_as_background_process(desc: str, func, *args, bg_start_span=True, **kwargs):
"""Run the given function in its own logcontext, with resource metrics """Run the given function in its own logcontext, with resource metrics
This should be used to wrap processes which are fired off to run in the This should be used to wrap processes which are fired off to run in the
@ -181,6 +181,9 @@ def run_as_background_process(desc: str, func, *args, **kwargs):
Args: Args:
desc: a description for this background process type desc: a description for this background process type
func: a function, which may return a Deferred or a coroutine func: a function, which may return a Deferred or a coroutine
bg_start_span: Whether to start an opentracing span. Defaults to True.
Should only be disabled for processes that will not log to or tag
a span.
args: positional args for func args: positional args for func
kwargs: keyword args for func kwargs: keyword args for func
@ -199,7 +202,10 @@ def run_as_background_process(desc: str, func, *args, **kwargs):
with BackgroundProcessLoggingContext(desc) as context: with BackgroundProcessLoggingContext(desc) as context:
context.request = "%s-%i" % (desc, count) context.request = "%s-%i" % (desc, count)
try: try:
with start_active_span(desc, tags={"request_id": context.request}): ctx = noop_context_manager()
if bg_start_span:
ctx = start_active_span(desc, tags={"request_id": context.request})
with ctx:
result = func(*args, **kwargs) result = func(*args, **kwargs)
if inspect.isawaitable(result): if inspect.isawaitable(result):

View File

@ -40,7 +40,6 @@ from synapse.handlers.presence import format_user_presence_state
from synapse.logging.context import PreserveLoggingContext from synapse.logging.context import PreserveLoggingContext
from synapse.logging.utils import log_function from synapse.logging.utils import log_function
from synapse.metrics import LaterGauge from synapse.metrics import LaterGauge
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.streams.config import PaginationConfig from synapse.streams.config import PaginationConfig
from synapse.types import ( from synapse.types import (
Collection, Collection,
@ -310,44 +309,37 @@ class Notifier:
""" """
# poke any interested application service. # poke any interested application service.
run_as_background_process( self._notify_app_services(max_room_stream_token)
"_notify_app_services", self._notify_app_services, max_room_stream_token self._notify_pusher_pool(max_room_stream_token)
)
run_as_background_process(
"_notify_pusher_pool", self._notify_pusher_pool, max_room_stream_token
)
if self.federation_sender: if self.federation_sender:
self.federation_sender.notify_new_events(max_room_stream_token) self.federation_sender.notify_new_events(max_room_stream_token)
async def _notify_app_services(self, max_room_stream_token: RoomStreamToken): def _notify_app_services(self, max_room_stream_token: RoomStreamToken):
try: try:
await self.appservice_handler.notify_interested_services( self.appservice_handler.notify_interested_services(max_room_stream_token)
max_room_stream_token
)
except Exception: except Exception:
logger.exception("Error notifying application services of event") logger.exception("Error notifying application services of event")
async def _notify_app_services_ephemeral( def _notify_app_services_ephemeral(
self, self,
stream_key: str, stream_key: str,
new_token: Union[int, RoomStreamToken], new_token: Union[int, RoomStreamToken],
users: Collection[UserID] = [], users: Collection[Union[str, UserID]] = [],
): ):
try: try:
stream_token = None stream_token = None
if isinstance(new_token, int): if isinstance(new_token, int):
stream_token = new_token stream_token = new_token
await self.appservice_handler.notify_interested_services_ephemeral( self.appservice_handler.notify_interested_services_ephemeral(
stream_key, stream_token, users stream_key, stream_token, users
) )
except Exception: except Exception:
logger.exception("Error notifying application services of event") logger.exception("Error notifying application services of event")
async def _notify_pusher_pool(self, max_room_stream_token: RoomStreamToken): def _notify_pusher_pool(self, max_room_stream_token: RoomStreamToken):
try: try:
await self._pusher_pool.on_new_notifications(max_room_stream_token) self._pusher_pool.on_new_notifications(max_room_stream_token)
except Exception: except Exception:
logger.exception("Error pusher pool of event") logger.exception("Error pusher pool of event")
@ -384,12 +376,8 @@ class Notifier:
self.notify_replication() self.notify_replication()
# Notify appservices # Notify appservices
run_as_background_process( self._notify_app_services_ephemeral(
"_notify_app_services_ephemeral", stream_key, new_token, users,
self._notify_app_services_ephemeral,
stream_key,
new_token,
users,
) )
def on_new_replication_data(self) -> None: def on_new_replication_data(self) -> None:

View File

@ -19,7 +19,10 @@ from typing import TYPE_CHECKING, Dict, Union
from prometheus_client import Gauge from prometheus_client import Gauge
from synapse.metrics.background_process_metrics import run_as_background_process from synapse.metrics.background_process_metrics import (
run_as_background_process,
wrap_as_background_process,
)
from synapse.push import PusherConfigException from synapse.push import PusherConfigException
from synapse.push.emailpusher import EmailPusher from synapse.push.emailpusher import EmailPusher
from synapse.push.httppusher import HttpPusher from synapse.push.httppusher import HttpPusher
@ -187,7 +190,7 @@ class PusherPool:
) )
await self.remove_pusher(p["app_id"], p["pushkey"], p["user_name"]) await self.remove_pusher(p["app_id"], p["pushkey"], p["user_name"])
async def on_new_notifications(self, max_token: RoomStreamToken): def on_new_notifications(self, max_token: RoomStreamToken):
if not self.pushers: if not self.pushers:
# nothing to do here. # nothing to do here.
return return
@ -201,6 +204,17 @@ class PusherPool:
# Nothing to do # Nothing to do
return return
# We only start a new background process if necessary rather than
# optimistically (to cut down on overhead).
self._on_new_notifications(max_token)
@wrap_as_background_process("on_new_notifications")
async def _on_new_notifications(self, max_token: RoomStreamToken):
# We just use the minimum stream ordering and ignore the vector clock
# component. This is safe to do as long as we *always* ignore the vector
# clock components.
max_stream_id = max_token.stream
prev_stream_id = self._last_room_stream_id_seen prev_stream_id = self._last_room_stream_id_seen
self._last_room_stream_id_seen = max_stream_id self._last_room_stream_id_seen = max_stream_id

View File

@ -166,7 +166,9 @@ class RedisSubscriber(txredisapi.SubscriberProtocol, AbstractConnection):
Args: Args:
cmd (Command) cmd (Command)
""" """
run_as_background_process("send-cmd", self._async_send_command, cmd) run_as_background_process(
"send-cmd", self._async_send_command, cmd, bg_start_span=False
)
async def _async_send_command(self, cmd: Command): async def _async_send_command(self, cmd: Command):
"""Encode a replication command and send it over our outbound connection""" """Encode a replication command and send it over our outbound connection"""

View File

@ -42,7 +42,6 @@ class AppServiceHandlerTestCase(unittest.TestCase):
hs.get_clock.return_value = MockClock() hs.get_clock.return_value = MockClock()
self.handler = ApplicationServicesHandler(hs) self.handler = ApplicationServicesHandler(hs)
@defer.inlineCallbacks
def test_notify_interested_services(self): def test_notify_interested_services(self):
interested_service = self._mkservice(is_interested=True) interested_service = self._mkservice(is_interested=True)
services = [ services = [
@ -62,14 +61,12 @@ class AppServiceHandlerTestCase(unittest.TestCase):
defer.succeed((0, [event])), defer.succeed((0, [event])),
defer.succeed((0, [])), defer.succeed((0, [])),
] ]
yield defer.ensureDeferred( self.handler.notify_interested_services(RoomStreamToken(None, 0))
self.handler.notify_interested_services(RoomStreamToken(None, 0))
)
self.mock_scheduler.submit_event_for_as.assert_called_once_with( self.mock_scheduler.submit_event_for_as.assert_called_once_with(
interested_service, event interested_service, event
) )
@defer.inlineCallbacks
def test_query_user_exists_unknown_user(self): def test_query_user_exists_unknown_user(self):
user_id = "@someone:anywhere" user_id = "@someone:anywhere"
services = [self._mkservice(is_interested=True)] services = [self._mkservice(is_interested=True)]
@ -83,12 +80,11 @@ class AppServiceHandlerTestCase(unittest.TestCase):
defer.succeed((0, [event])), defer.succeed((0, [event])),
defer.succeed((0, [])), defer.succeed((0, [])),
] ]
yield defer.ensureDeferred(
self.handler.notify_interested_services(RoomStreamToken(None, 0)) self.handler.notify_interested_services(RoomStreamToken(None, 0))
)
self.mock_as_api.query_user.assert_called_once_with(services[0], user_id) self.mock_as_api.query_user.assert_called_once_with(services[0], user_id)
@defer.inlineCallbacks
def test_query_user_exists_known_user(self): def test_query_user_exists_known_user(self):
user_id = "@someone:anywhere" user_id = "@someone:anywhere"
services = [self._mkservice(is_interested=True)] services = [self._mkservice(is_interested=True)]
@ -102,9 +98,9 @@ class AppServiceHandlerTestCase(unittest.TestCase):
defer.succeed((0, [event])), defer.succeed((0, [event])),
defer.succeed((0, [])), defer.succeed((0, [])),
] ]
yield defer.ensureDeferred(
self.handler.notify_interested_services(RoomStreamToken(None, 0)) self.handler.notify_interested_services(RoomStreamToken(None, 0))
)
self.assertFalse( self.assertFalse(
self.mock_as_api.query_user.called, self.mock_as_api.query_user.called,
"query_user called when it shouldn't have been.", "query_user called when it shouldn't have been.",