Instrument the federation/backfill part of /messages (#13489)

Instrument the federation/backfill part of `/messages` so it's easier to follow what's going on in Jaeger when viewing a trace.

Split out from https://github.com/matrix-org/synapse/pull/13440

Follow-up from https://github.com/matrix-org/synapse/pull/13368

Part of https://github.com/matrix-org/synapse/issues/13356
This commit is contained in:
Eric Eastwood 2022-08-16 12:39:40 -05:00 committed by GitHub
parent 5ace5d7b15
commit 0a4efbc1dd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 220 additions and 33 deletions

View file

@ -59,7 +59,13 @@ from synapse.events import EventBase
from synapse.events.snapshot import EventContext
from synapse.federation.federation_client import InvalidResponseError
from synapse.logging.context import nested_logging_context
from synapse.logging.opentracing import trace
from synapse.logging.opentracing import (
SynapseTags,
set_tag,
start_active_span,
tag_args,
trace,
)
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.replication.http.devices import ReplicationUserDevicesResyncRestServlet
from synapse.replication.http.federation import (
@ -410,6 +416,7 @@ class FederationEventHandler:
prev_member_event,
)
@trace
async def process_remote_join(
self,
origin: str,
@ -715,7 +722,7 @@ class FederationEventHandler:
@trace
async def _process_pulled_events(
self, origin: str, events: Iterable[EventBase], backfilled: bool
self, origin: str, events: Collection[EventBase], backfilled: bool
) -> None:
"""Process a batch of events we have pulled from a remote server
@ -730,6 +737,15 @@ class FederationEventHandler:
backfilled: True if this is part of a historical batch of events (inhibits
notification to clients, and validation of device keys.)
"""
set_tag(
SynapseTags.FUNC_ARG_PREFIX + "event_ids",
str([event.event_id for event in events]),
)
set_tag(
SynapseTags.FUNC_ARG_PREFIX + "event_ids.length",
str(len(events)),
)
set_tag(SynapseTags.FUNC_ARG_PREFIX + "backfilled", str(backfilled))
logger.debug(
"processing pulled backfilled=%s events=%s",
backfilled,
@ -753,6 +769,7 @@ class FederationEventHandler:
await self._process_pulled_event(origin, ev, backfilled=backfilled)
@trace
@tag_args
async def _process_pulled_event(
self, origin: str, event: EventBase, backfilled: bool
) -> None:
@ -854,6 +871,7 @@ class FederationEventHandler:
else:
raise
@trace
async def _compute_event_context_with_maybe_missing_prevs(
self, dest: str, event: EventBase
) -> EventContext:
@ -970,6 +988,8 @@ class FederationEventHandler:
event, state_ids_before_event=state_map, partial_state=partial_state
)
@trace
@tag_args
async def _get_state_ids_after_missing_prev_event(
self,
destination: str,
@ -1009,10 +1029,10 @@ class FederationEventHandler:
logger.debug("Fetching %i events from cache/store", len(desired_events))
have_events = await self._store.have_seen_events(room_id, desired_events)
missing_desired_events = desired_events - have_events
missing_desired_event_ids = desired_events - have_events
logger.debug(
"We are missing %i events (got %i)",
len(missing_desired_events),
len(missing_desired_event_ids),
len(have_events),
)
@ -1024,13 +1044,30 @@ class FederationEventHandler:
# already have a bunch of the state events. It would be nice if the
# federation api gave us a way of finding out which we actually need.
missing_auth_events = set(auth_event_ids) - have_events
missing_auth_events.difference_update(
await self._store.have_seen_events(room_id, missing_auth_events)
missing_auth_event_ids = set(auth_event_ids) - have_events
missing_auth_event_ids.difference_update(
await self._store.have_seen_events(room_id, missing_auth_event_ids)
)
logger.debug("We are also missing %i auth events", len(missing_auth_events))
logger.debug("We are also missing %i auth events", len(missing_auth_event_ids))
missing_events = missing_desired_events | missing_auth_events
missing_event_ids = missing_desired_event_ids | missing_auth_event_ids
set_tag(
SynapseTags.RESULT_PREFIX + "missing_auth_event_ids",
str(missing_auth_event_ids),
)
set_tag(
SynapseTags.RESULT_PREFIX + "missing_auth_event_ids.length",
str(len(missing_auth_event_ids)),
)
set_tag(
SynapseTags.RESULT_PREFIX + "missing_desired_event_ids",
str(missing_desired_event_ids),
)
set_tag(
SynapseTags.RESULT_PREFIX + "missing_desired_event_ids.length",
str(len(missing_desired_event_ids)),
)
# Making an individual request for each of 1000s of events has a lot of
# overhead. On the other hand, we don't really want to fetch all of the events
@ -1041,13 +1078,13 @@ class FederationEventHandler:
#
# TODO: might it be better to have an API which lets us do an aggregate event
# request
if (len(missing_events) * 10) >= len(auth_event_ids) + len(state_event_ids):
if (len(missing_event_ids) * 10) >= len(auth_event_ids) + len(state_event_ids):
logger.debug("Requesting complete state from remote")
await self._get_state_and_persist(destination, room_id, event_id)
else:
logger.debug("Fetching %i events from remote", len(missing_events))
logger.debug("Fetching %i events from remote", len(missing_event_ids))
await self._get_events_and_persist(
destination=destination, room_id=room_id, event_ids=missing_events
destination=destination, room_id=room_id, event_ids=missing_event_ids
)
# We now need to fill out the state map, which involves fetching the
@ -1104,6 +1141,14 @@ class FederationEventHandler:
event_id,
failed_to_fetch,
)
set_tag(
SynapseTags.RESULT_PREFIX + "failed_to_fetch",
str(failed_to_fetch),
)
set_tag(
SynapseTags.RESULT_PREFIX + "failed_to_fetch.length",
str(len(failed_to_fetch)),
)
if remote_event.is_state() and remote_event.rejected_reason is None:
state_map[
@ -1112,6 +1157,8 @@ class FederationEventHandler:
return state_map
@trace
@tag_args
async def _get_state_and_persist(
self, destination: str, room_id: str, event_id: str
) -> None:
@ -1133,6 +1180,7 @@ class FederationEventHandler:
destination=destination, room_id=room_id, event_ids=(event_id,)
)
@trace
async def _process_received_pdu(
self,
origin: str,
@ -1283,6 +1331,7 @@ class FederationEventHandler:
except Exception:
logger.exception("Failed to resync device for %s", sender)
@trace
async def _handle_marker_event(self, origin: str, marker_event: EventBase) -> None:
"""Handles backfilling the insertion event when we receive a marker
event that points to one.
@ -1414,6 +1463,8 @@ class FederationEventHandler:
return event_from_response
@trace
@tag_args
async def _get_events_and_persist(
self, destination: str, room_id: str, event_ids: Collection[str]
) -> None:
@ -1459,6 +1510,7 @@ class FederationEventHandler:
logger.info("Fetched %i events of %i requested", len(events), len(event_ids))
await self._auth_and_persist_outliers(room_id, events)
@trace
async def _auth_and_persist_outliers(
self, room_id: str, events: Iterable[EventBase]
) -> None:
@ -1477,6 +1529,16 @@ class FederationEventHandler:
"""
event_map = {event.event_id: event for event in events}
event_ids = event_map.keys()
set_tag(
SynapseTags.FUNC_ARG_PREFIX + "event_ids",
str(event_ids),
)
set_tag(
SynapseTags.FUNC_ARG_PREFIX + "event_ids.length",
str(len(event_ids)),
)
# filter out any events we have already seen. This might happen because
# the events were eagerly pushed to us (eg, during a room join), or because
# another thread has raced against us since we decided to request the event.
@ -1593,6 +1655,7 @@ class FederationEventHandler:
backfilled=True,
)
@trace
async def _check_event_auth(
self, origin: Optional[str], event: EventBase, context: EventContext
) -> None:
@ -1631,6 +1694,14 @@ class FederationEventHandler:
claimed_auth_events = await self._load_or_fetch_auth_events_for_event(
origin, event
)
set_tag(
SynapseTags.RESULT_PREFIX + "claimed_auth_events",
str([ev.event_id for ev in claimed_auth_events]),
)
set_tag(
SynapseTags.RESULT_PREFIX + "claimed_auth_events.length",
str(len(claimed_auth_events)),
)
# ... and check that the event passes auth at those auth events.
# https://spec.matrix.org/v1.3/server-server-api/#checks-performed-on-receipt-of-a-pdu:
@ -1728,6 +1799,7 @@ class FederationEventHandler:
)
context.rejected = RejectedReason.AUTH_ERROR
@trace
async def _maybe_kick_guest_users(self, event: EventBase) -> None:
if event.type != EventTypes.GuestAccess:
return
@ -1935,6 +2007,8 @@ class FederationEventHandler:
# instead we raise an AuthError, which will make the caller ignore it.
raise AuthError(code=HTTPStatus.FORBIDDEN, msg="Auth events could not be found")
@trace
@tag_args
async def _get_remote_auth_chain_for_event(
self, destination: str, room_id: str, event_id: str
) -> None:
@ -1963,6 +2037,7 @@ class FederationEventHandler:
await self._auth_and_persist_outliers(room_id, remote_auth_events)
@trace
async def _run_push_actions_and_persist_event(
self, event: EventBase, context: EventContext, backfilled: bool = False
) -> None:
@ -2071,8 +2146,17 @@ class FederationEventHandler:
self._message_handler.maybe_schedule_expiry(event)
if not backfilled: # Never notify for backfilled events
for event in events:
await self._notify_persisted_event(event, max_stream_token)
with start_active_span("notify_persisted_events"):
set_tag(
SynapseTags.RESULT_PREFIX + "event_ids",
str([ev.event_id for ev in events]),
)
set_tag(
SynapseTags.RESULT_PREFIX + "event_ids.length",
str(len(events)),
)
for event in events:
await self._notify_persisted_event(event, max_stream_token)
return max_stream_token.stream