diff --git a/changelog.d/10134.misc b/changelog.d/10134.misc new file mode 100644 index 000000000..ce9702645 --- /dev/null +++ b/changelog.d/10134.misc @@ -0,0 +1 @@ +Improve OpenTracing for event persistence. diff --git a/synapse/api/auth.py b/synapse/api/auth.py index 26a3b3891..cf4333a92 100644 --- a/synapse/api/auth.py +++ b/synapse/api/auth.py @@ -207,7 +207,7 @@ class Auth: request.requester = user_id if user_id in self._force_tracing_for_users: - opentracing.set_tag(opentracing.tags.SAMPLING_PRIORITY, 1) + opentracing.force_tracing() opentracing.set_tag("authenticated_entity", user_id) opentracing.set_tag("user_id", user_id) opentracing.set_tag("appservice_id", app_service.id) @@ -260,7 +260,7 @@ class Auth: request.requester = requester if user_info.token_owner in self._force_tracing_for_users: - opentracing.set_tag(opentracing.tags.SAMPLING_PRIORITY, 1) + opentracing.force_tracing() opentracing.set_tag("authenticated_entity", user_info.token_owner) opentracing.set_tag("user_id", user_info.user_id) if device_id: diff --git a/synapse/logging/opentracing.py b/synapse/logging/opentracing.py index 5b4725e03..4f18792c9 100644 --- a/synapse/logging/opentracing.py +++ b/synapse/logging/opentracing.py @@ -168,7 +168,7 @@ import inspect import logging import re from functools import wraps -from typing import TYPE_CHECKING, Dict, List, Optional, Pattern, Type +from typing import TYPE_CHECKING, Collection, Dict, List, Optional, Pattern, Type import attr @@ -278,6 +278,10 @@ class SynapseTags: DB_TXN_ID = "db.txn_id" +class SynapseBaggage: + FORCE_TRACING = "synapse-force-tracing" + + # Block everything by default # A regex which matches the server_names to expose traces for. # None means 'block everything'. @@ -285,6 +289,8 @@ _homeserver_whitelist = None # type: Optional[Pattern[str]] # Util methods +Sentinel = object() + def only_if_tracing(func): """Executes the function only if we're tracing. Otherwise returns None.""" @@ -447,12 +453,28 @@ def start_active_span( ) -def start_active_span_follows_from(operation_name, contexts): +def start_active_span_follows_from( + operation_name: str, contexts: Collection, inherit_force_tracing=False +): + """Starts an active opentracing span, with additional references to previous spans + + Args: + operation_name: name of the operation represented by the new span + contexts: the previous spans to inherit from + inherit_force_tracing: if set, and any of the previous contexts have had tracing + forced, the new span will also have tracing forced. + """ if opentracing is None: return noop_context_manager() references = [opentracing.follows_from(context) for context in contexts] scope = start_active_span(operation_name, references=references) + + if inherit_force_tracing and any( + is_context_forced_tracing(ctx) for ctx in contexts + ): + force_tracing(scope.span) + return scope @@ -551,6 +573,10 @@ def start_active_span_from_edu( # Opentracing setters for tags, logs, etc +@only_if_tracing +def active_span(): + """Get the currently active span, if any""" + return opentracing.tracer.active_span @ensure_active_span("set a tag") @@ -571,6 +597,33 @@ def set_operation_name(operation_name): opentracing.tracer.active_span.set_operation_name(operation_name) +@only_if_tracing +def force_tracing(span=Sentinel) -> None: + """Force sampling for the active/given span and its children. + + Args: + span: span to force tracing for. By default, the active span. + """ + if span is Sentinel: + span = opentracing.tracer.active_span + if span is None: + logger.error("No active span in force_tracing") + return + + span.set_tag(opentracing.tags.SAMPLING_PRIORITY, 1) + + # also set a bit of baggage, so that we have a way of figuring out if + # it is enabled later + span.set_baggage_item(SynapseBaggage.FORCE_TRACING, "1") + + +def is_context_forced_tracing(span_context) -> bool: + """Check if sampling has been force for the given span context.""" + if span_context is None: + return False + return span_context.baggage.get(SynapseBaggage.FORCE_TRACING) is not None + + # Injection and extraction diff --git a/synapse/storage/persist_events.py b/synapse/storage/persist_events.py index c11f6c584..dc38942bb 100644 --- a/synapse/storage/persist_events.py +++ b/synapse/storage/persist_events.py @@ -18,6 +18,7 @@ import itertools import logging from collections import deque from typing import ( + Any, Awaitable, Callable, Collection, @@ -40,6 +41,7 @@ from twisted.internet import defer from synapse.api.constants import EventTypes, Membership from synapse.events import EventBase from synapse.events.snapshot import EventContext +from synapse.logging import opentracing from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.databases import Databases @@ -103,12 +105,18 @@ times_pruned_extremities = Counter( ) -@attr.s(auto_attribs=True, frozen=True, slots=True) +@attr.s(auto_attribs=True, slots=True) class _EventPersistQueueItem: events_and_contexts: List[Tuple[EventBase, EventContext]] backfilled: bool deferred: ObservableDeferred + parent_opentracing_span_contexts: List = [] + """A list of opentracing spans waiting for this batch""" + + opentracing_span_context: Any = None + """The opentracing span under which the persistence actually happened""" + _PersistResult = TypeVar("_PersistResult") @@ -171,9 +179,27 @@ class _EventPeristenceQueue(Generic[_PersistResult]): ) queue.append(end_item) + # add our events to the queue item end_item.events_and_contexts.extend(events_and_contexts) + + # also add our active opentracing span to the item so that we get a link back + span = opentracing.active_span() + if span: + end_item.parent_opentracing_span_contexts.append(span.context) + + # start a processor for the queue, if there isn't one already self._handle_queue(room_id) - return await make_deferred_yieldable(end_item.deferred.observe()) + + # wait for the queue item to complete + res = await make_deferred_yieldable(end_item.deferred.observe()) + + # add another opentracing span which links to the persist trace. + with opentracing.start_active_span_follows_from( + "persist_event_batch_complete", (end_item.opentracing_span_context,) + ): + pass + + return res def _handle_queue(self, room_id): """Attempts to handle the queue for a room if not already being handled. @@ -200,9 +226,17 @@ class _EventPeristenceQueue(Generic[_PersistResult]): queue = self._get_drainining_queue(room_id) for item in queue: try: - ret = await self._per_item_callback( - item.events_and_contexts, item.backfilled - ) + with opentracing.start_active_span_follows_from( + "persist_event_batch", + item.parent_opentracing_span_contexts, + inherit_force_tracing=True, + ) as scope: + if scope: + item.opentracing_span_context = scope.span.context + + ret = await self._per_item_callback( + item.events_and_contexts, item.backfilled + ) except Exception: with PreserveLoggingContext(): item.deferred.errback() @@ -252,6 +286,7 @@ class EventsPersistenceStorage: self._event_persist_queue = _EventPeristenceQueue(self._persist_event_batch) self._state_resolution_handler = hs.get_state_resolution_handler() + @opentracing.trace async def persist_events( self, events_and_contexts: Iterable[Tuple[EventBase, EventContext]], @@ -307,6 +342,7 @@ class EventsPersistenceStorage: self.main_store.get_room_max_token(), ) + @opentracing.trace async def persist_event( self, event: EventBase, context: EventContext, backfilled: bool = False ) -> Tuple[EventBase, PersistedEventPosition, RoomStreamToken]: