#
# This file is licensed under the Affero General Public License (AGPL) version 3.
#
# Copyright 2017 Vector Creations Ltd
# Copyright (C) 2023 New Vector, Ltd
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# See the GNU Affero General Public License for more details:
# <https://www.gnu.org/licenses/agpl-3.0.html>.
#
# Originally licensed under the Apache License, Version 2.0:
# <http://www.apache.org/licenses/LICENSE-2.0>.
#
# [This file includes modifications made by New Vector Limited]
#
#
import heapq
from collections import defaultdict
from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Type, TypeVar, cast

import attr

from synapse.replication.tcp.streams._base import (
    StreamRow,
    StreamUpdateResult,
    Token,
    _StreamFromIdGen,
)

if TYPE_CHECKING:
    from synapse.server import HomeServer
"""Handling of the 'events' replication stream
|
|
|
|
|
|
|
|
This stream contains rows of various types. Each row therefore contains a 'type'
|
|
|
|
identifier before the real data. For example::
|
|
|
|
|
2019-03-27 12:15:59 -04:00
|
|
|
RDATA events batch ["state", ["!room:id", "m.type", "", "$event:id"]]
|
2019-03-27 11:18:28 -04:00
|
|
|
RDATA events 12345 ["ev", ["$event:id", "!room:id", "m.type", null, null]]
|
|
|
|
|
|
|
|
An "ev" row is sent for each new event. The fields in the data part are:
|
|
|
|
|
|
|
|
* The new event id
|
|
|
|
* The room id for the event
|
|
|
|
* The type of the new event
|
|
|
|
* The state key of the event, for state events
|
|
|
|
* The event id of an event which is redacted by this event.

A "state" row is sent whenever the "current state" in a room changes. The fields in the
data part are:

 * The room id for the state change
 * The event type of the state which has changed
 * The state_key of the state which has changed
 * The event id of the new state

A "state-all" row is sent whenever the "current state" in a room changes, but there are
too many state updates for a particular room in the same update. This replaces any
"state" rows on a per-room basis. The fields in the data part are:

 * The room id for the state changes
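
For example (an illustrative row; the data part carries only the room id)::

    RDATA events 12345 ["state-all", ["!room:id"]]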

"""

# Any room with at least _MAX_STATE_UPDATES_PER_ROOM state updates in a single batch
# will send an EventsStreamAllStateRow instead of the individual
# EventsStreamCurrentStateRow rows. This is predominantly useful when purging large
# rooms.
_MAX_STATE_UPDATES_PER_ROOM = 150


@attr.s(slots=True, frozen=True, auto_attribs=True)
class EventsStreamRow:
    """A parsed row from the events replication stream"""

    type: str  # the TypeId of one of the *EventsStreamRows
    data: "BaseEventsStreamRow"


T = TypeVar("T", bound="BaseEventsStreamRow")


class BaseEventsStreamRow:
    """Base class for rows to be sent in the events stream.

    Specifies how to identify, serialize and deserialize the different types.
    """

    # Unique string that ids the type. Must be overridden in subclasses.
    TypeId: str

    @classmethod
    def from_data(cls: Type[T], data: Iterable[Optional[str]]) -> T:
        """Parse the data from the replication stream into a row.

        By default we just call the constructor with the data list as arguments.

        Args:
            data: The value of the data object from the replication stream
        """
        return cls(*data)
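
    # For example (illustrative values),
    #     EventsStreamCurrentStateRow.from_data(["!room:id", "m.type", "", "$event:id"])
    # is equivalent to
    #     EventsStreamCurrentStateRow("!room:id", "m.type", "", "$event:id")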


@attr.s(slots=True, frozen=True, auto_attribs=True)
class EventsStreamEventRow(BaseEventsStreamRow):
    TypeId = "ev"

    event_id: str
    room_id: str
    type: str
    state_key: Optional[str]
    redacts: Optional[str]
    relates_to: Optional[str]
    membership: Optional[str]
    rejected: bool
    outlier: bool


@attr.s(slots=True, frozen=True, auto_attribs=True)
class EventsStreamCurrentStateRow(BaseEventsStreamRow):
    TypeId = "state"

    room_id: str
    type: str
    state_key: str
    event_id: Optional[str]


@attr.s(slots=True, frozen=True, auto_attribs=True)
class EventsStreamAllStateRow(BaseEventsStreamRow):
    TypeId = "state-all"

    room_id: str


_EventRows: Tuple[Type[BaseEventsStreamRow], ...] = (
    EventsStreamEventRow,
    EventsStreamCurrentStateRow,
    EventsStreamAllStateRow,
)

TypeToRow = {Row.TypeId: Row for Row in _EventRows}
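# i.e. TypeToRow == {
#     "ev": EventsStreamEventRow,
#     "state": EventsStreamCurrentStateRow,
#     "state-all": EventsStreamAllStateRow,
# }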


class EventsStream(_StreamFromIdGen):
    """We received a new event, or an event went from being an outlier to not"""

    NAME = "events"

    def __init__(self, hs: "HomeServer"):
        self._store = hs.get_datastores().main
        super().__init__(
            hs.get_instance_name(), self._update_function, self._store._stream_id_gen
        )

    async def _update_function(
        self,
        instance_name: str,
        from_token: Token,
        current_token: Token,
        target_row_count: int,
    ) -> StreamUpdateResult:
        # The events stream cannot be "reset", so it's safe to return early if
        # the from token has caught up with (or passed) the current token (the DB
        # query would trivially return 0 rows anyway).
        if from_token >= current_token:
            return [], current_token, False

        # the events stream merges together three separate sources:
        #  * new events
        #  * current_state changes
        #  * events which were previously outliers, but have now been de-outliered.
        #
        # The merge operation is complicated by the fact that we only have a single
        # "stream token" which is supposed to indicate how far we have got through
        # all three streams. It's therefore no good to return rows 1-1000 from the
        # "new events" table if the state_deltas are limited to rows 1-100 by the
        # target_row_count.
        #
        # In other words: we must pick a new upper limit, and must return *all* rows
        # up to that point for each of the three sources.
        #
        # Start by trying to split the target_row_count up. We expect to have a
        # negligible number of ex-outliers, and a rough approximation based on recent
        # traffic on sw1v.org shows that there are approximately the same number of
        # event rows between a given pair of stream ids as there are state
        # updates, so let's split our target_row_count among those two types. The target
        # is only an approximation - it doesn't matter if we end up going a bit over it.

        target_row_count //= 2
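        # e.g. with a target_row_count of 100 we fetch at most 50 rows from the
        # events table and at most 50 state delta rows; the ex-outliers below are
        # fetched without a limit.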

        # now we fetch up to that many rows from the events table

        event_rows = await self._store.get_all_new_forward_event_rows(
            instance_name, from_token, current_token, target_row_count
        )

        # we rely on get_all_new_forward_event_rows strictly honouring the limit, so
        # that we know it is safe to just take upper_limit = event_rows[-1][0].
        assert (
            len(event_rows) <= target_row_count
        ), "get_all_new_forward_event_rows did not honour row limit"

        # if we hit the limit on event_updates, there's no point in going beyond the
        # last stream_id in the batch for the other sources.

        if len(event_rows) == target_row_count:
            limited = True
            upper_limit: int = event_rows[-1][0]
        else:
            limited = False
            upper_limit = current_token

        # next up is the state delta table.
        (
            state_rows,
            upper_limit,
            state_rows_limited,
        ) = await self._store.get_all_updated_current_state_deltas(
            instance_name, from_token, upper_limit, target_row_count
        )

        limited = limited or state_rows_limited

        # finally, fetch the ex-outliers rows. We assume there are few enough of these
        # not to bother with the limit.

        ex_outliers_rows = await self._store.get_ex_outlier_stream_rows(
            instance_name, from_token, upper_limit
        )

        # we now need to turn the raw database rows returned into tuples suitable
        # for the replication protocol (basically, we add an identifier to
        # distinguish the row type). At the same time, we can limit the event_rows
        # to the max stream_id from state_rows.

        event_updates: Iterable[Tuple[int, Tuple]] = (
            (stream_id, (EventsStreamEventRow.TypeId, rest))
            for (stream_id, *rest) in event_rows
            if stream_id <= upper_limit
        )

        # Separate out rooms that have many state updates; listeners should clear
        # all state for those rooms.
        state_updates_by_room = defaultdict(list)
        for stream_id, room_id, _type, _state_key, _event_id in state_rows:
            state_updates_by_room[room_id].append(stream_id)

        state_all_rows = [
            (stream_ids[-1], room_id)
            for room_id, stream_ids in state_updates_by_room.items()
            if len(stream_ids) >= _MAX_STATE_UPDATES_PER_ROOM
        ]
        state_all_updates: Iterable[Tuple[int, Tuple]] = (
            (max_stream_id, (EventsStreamAllStateRow.TypeId, (room_id,)))
            for (max_stream_id, room_id) in state_all_rows
        )

        # Any remaining state updates are sent individually.
        state_all_rooms = {room_id for _, room_id in state_all_rows}
        state_updates: Iterable[Tuple[int, Tuple]] = (
            (stream_id, (EventsStreamCurrentStateRow.TypeId, rest))
            for (stream_id, *rest) in state_rows
            if rest[0] not in state_all_rooms
        )
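
        # In other words, a room that reached _MAX_STATE_UPDATES_PER_ROOM state rows
        # in this batch contributes a single "state-all" update (keyed by the last of
        # its stream ids seen here) rather than one "state" update per row.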

        ex_outliers_updates: Iterable[Tuple[int, Tuple]] = (
            (stream_id, (EventsStreamEventRow.TypeId, rest))
            for (stream_id, *rest) in ex_outliers_rows
        )

        # we need to return a sorted list, so merge them together.
        updates = list(
            heapq.merge(
                event_updates, state_all_updates, state_updates, ex_outliers_updates
            )
        )
        return updates, upper_limit, limited

    @classmethod
    def parse_row(cls, row: StreamRow) -> "EventsStreamRow":
        (typ, data) = cast(Tuple[str, Iterable[Optional[str]]], row)
        event_stream_row_data = TypeToRow[typ].from_data(data)
        return EventsStreamRow(typ, event_stream_row_data)
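
    # For example (illustrative values), parse_row(["state", ["!room:id", "m.type",
    # "", "$event:id"]]) returns:
    #     EventsStreamRow(
    #         type="state",
    #         data=EventsStreamCurrentStateRow("!room:id", "m.type", "", "$event:id"),
    #     )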