From 82a2e63d235b6288afb960f8fa6f9b79e25ff24e Mon Sep 17 00:00:00 2001 From: David Teller Date: Thu, 24 Feb 2022 13:43:31 +0100 Subject: [PATCH] A room Protection designed to measure lag in a room (#217) --- package.json | 3 +- src/Mjolnir.ts | 16 +- src/commands/CommandHandler.ts | 6 +- src/commands/StatusCommand.ts | 40 +- src/protections/DetectFederationLag.ts | 742 ++++++++++++++++++++ src/protections/IProtection.ts | 37 +- src/protections/ProtectionSettings.ts | 65 +- src/protections/protections.ts | 8 +- test/integration/detectFederationLagTest.ts | 266 +++++++ test/integration/fixtures.ts | 2 + yarn.lock | 17 +- 11 files changed, 1166 insertions(+), 36 deletions(-) create mode 100644 src/protections/DetectFederationLag.ts create mode 100644 test/integration/detectFederationLagTest.ts diff --git a/package.json b/package.json index 607d32f..174f8bd 100644 --- a/package.json +++ b/package.json @@ -37,7 +37,8 @@ "html-to-text": "^8.0.0", "js-yaml": "^4.1.0", "jsdom": "^16.6.0", - "matrix-bot-sdk": "^0.5.19" + "matrix-bot-sdk": "^0.5.19", + "parse-duration": "^1.0.2" }, "engines": { "node": ">=14.0.0" diff --git a/src/Mjolnir.ts b/src/Mjolnir.ts index 168d5bc..1e13539 100644 --- a/src/Mjolnir.ts +++ b/src/Mjolnir.ts @@ -34,7 +34,7 @@ import { COMMAND_PREFIX, handleCommand } from "./commands/CommandHandler"; import { applyUserBans } from "./actions/ApplyBan"; import config from "./config"; import ErrorCache, { ERROR_KIND_FATAL, ERROR_KIND_PERMISSION } from "./ErrorCache"; -import { IProtection } from "./protections/IProtection"; +import { Protection } from "./protections/IProtection"; import { PROTECTIONS } from "./protections/protections"; import { ProtectionSettingValidationError } from "./protections/ProtectionSettings"; import { UnlistedUserRedactionQueue } from "./queues/UnlistedUserRedactionQueue"; @@ -67,7 +67,7 @@ export class Mjolnir { private displayName: string; private localpart: string; private currentState: string = STATE_NOT_STARTED; - public protections = new Map(); + public protections = new Map(); /** * This is for users who are not listed on a watchlist, * but have been flagged by the automatic spam detection as suispicous @@ -246,7 +246,7 @@ export class Mjolnir { return this.currentState; } - public get enabledProtections(): IProtection[] { + public get enabledProtections(): Protection[] { return [...this.protections.values()].filter(p => p.enabled); } @@ -464,7 +464,7 @@ export class Mjolnir { /* * Read org.matrix.mjolnir.setting state event, find any saved settings for * the requested protectionName, then iterate and validate against their parser - * counterparts in IProtection.settings and return those which validate + * counterparts in Protection.settings and return those which validate * * @param protectionName The name of the protection whose settings we're reading * @returns Every saved setting for this protectionName that has a valid value @@ -543,7 +543,7 @@ export class Mjolnir { * * @param protection The protection object we want to register */ - public async registerProtection(protection: IProtection): Promise { + public async registerProtection(protection: Protection) { this.protections.set(protection.name, protection) let enabledProtections: { enabled: string[] } | null = null; @@ -587,6 +587,10 @@ export class Mjolnir { return list; } + public getProtection(protectionName: string): Protection | null { + return this.protections.get(protectionName) ?? null; + } + public async watchList(roomRef: string): Promise { const joinedRooms = await this.client.getJoinedRooms(); const permalink = Permalinks.parseUrl(roomRef); @@ -854,7 +858,7 @@ export class Mjolnir { } } - if (Object.keys(this.protectedRooms).includes(roomId)) { + if (roomId in this.protectedRooms) { if (event['sender'] === await this.client.getUserId()) return; // Ignore ourselves // Iterate all the enabled protections diff --git a/src/commands/CommandHandler.ts b/src/commands/CommandHandler.ts index d138958..65ea340 100644 --- a/src/commands/CommandHandler.ts +++ b/src/commands/CommandHandler.ts @@ -1,5 +1,5 @@ /* -Copyright 2019-2021 The Matrix.org Foundation C.I.C. +Copyright 2019-2022 The Matrix.org Foundation C.I.C. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -38,6 +38,7 @@ import { execShutdownRoomCommand } from "./ShutdownRoomCommand"; import { execAddAliasCommand, execMoveAliasCommand, execRemoveAliasCommand, execResolveCommand } from "./AliasCommands"; import { execKickCommand } from "./KickCommand"; + export const COMMAND_PREFIX = "!mjolnir"; export async function handleCommand(roomId: string, event: { content: { body: string } }, mjolnir: Mjolnir) { @@ -46,7 +47,7 @@ export async function handleCommand(roomId: string, event: { content: { body: st try { if (parts.length === 1 || parts[1] === 'status') { - return await execStatusCommand(roomId, event, mjolnir); + return await execStatusCommand(roomId, event, mjolnir, parts.slice(2)); } else if (parts[1] === 'ban' && parts.length > 2) { return await execBanCommand(roomId, event, mjolnir, parts); } else if (parts[1] === 'unban' && parts.length > 2) { @@ -114,6 +115,7 @@ export async function handleCommand(roomId: string, event: { content: { body: st const menu = "" + "!mjolnir - Print status information\n" + "!mjolnir status - Print status information\n" + + "!mjolnir status protection [subcommand] - Print status information for a protection\n" + "!mjolnir ban [reason] - Adds an entity to the ban list\n" + "!mjolnir unban [apply] - Removes an entity from the ban list. If apply is 'true', the users matching the glob will actually be unbanned\n" + "!mjolnir redact [room alias/ID] [limit] - Redacts messages by the sender in the target room (or all rooms), up to a maximum number of events in the backlog (default 1000)\n" + diff --git a/src/commands/StatusCommand.ts b/src/commands/StatusCommand.ts index 665fb15..ecb0871 100644 --- a/src/commands/StatusCommand.ts +++ b/src/commands/StatusCommand.ts @@ -1,5 +1,5 @@ /* -Copyright 2019, 2020 The Matrix.org Foundation C.I.C. +Copyright 2019-2022 The Matrix.org Foundation C.I.C. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,9 +16,23 @@ limitations under the License. import { Mjolnir, STATE_CHECKING_PERMISSIONS, STATE_NOT_STARTED, STATE_RUNNING, STATE_SYNCING } from "../Mjolnir"; import { RichReply } from "matrix-bot-sdk"; +import { htmlEscape } from "../utils"; // !mjolnir -export async function execStatusCommand(roomId: string, event: any, mjolnir: Mjolnir) { +export async function execStatusCommand(roomId: string, event: any, mjolnir: Mjolnir, parts: string[]) { + switch (parts[0]) { + case undefined: + case 'mjolnir': + return showMjolnirStatus(roomId, event, mjolnir); + case 'protection': + return showProtectionStatus(roomId, event, mjolnir, parts.slice(/* ["protection"] */ 1)); + default: + throw new Error(`Invalid status command: ${htmlEscape(parts[0])}`); + } +} + +async function showMjolnirStatus(roomId: string, event: any, mjolnir: Mjolnir) { + // Display the status of Mjölnir. let html = ""; let text = ""; @@ -68,3 +82,25 @@ export async function execStatusCommand(roomId: string, event: any, mjolnir: Mjo reply["msgtype"] = "m.notice"; return mjolnir.client.sendMessage(roomId, reply); } + +async function showProtectionStatus(roomId: string, event: any, mjolnir: Mjolnir, parts: string[]) { + const protectionName = parts[0]; + const protection = mjolnir.getProtection(protectionName); + let text; + let html; + if (!protection) { + text = html = "Unknown protection"; + } else { + const status = await protection.statusCommand(mjolnir, parts.slice(1)); + if (status) { + text = status.text; + html = status.html; + } else { + text = ""; + html = "<no status>"; + } + } + const reply = RichReply.createFor(roomId, event, text, html); + reply["msgtype"] = "m.notice"; + await mjolnir.client.sendMessage(roomId, reply); +} diff --git a/src/protections/DetectFederationLag.ts b/src/protections/DetectFederationLag.ts new file mode 100644 index 0000000..548bf65 --- /dev/null +++ b/src/protections/DetectFederationLag.ts @@ -0,0 +1,742 @@ +/* +Copyright 2022 The Matrix.org Foundation C.I.C. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +import { Protection } from "./IProtection"; +import { DurationMSProtectionSetting, NumberProtectionSetting, StringSetProtectionSetting } from "./ProtectionSettings"; +import { Mjolnir } from "../Mjolnir"; +import { LogLevel, UserID } from "matrix-bot-sdk"; + +const DEFAULT_BUCKET_DURATION_MS = 10_000; +const DEFAULT_BUCKET_NUMBER = 6; +const DEFAULT_CLEANUP_PERIOD_MS = 3_600 * 1_000; +const DEFAULT_INITIAL_DELAY_GRACE_MS = 180_000; +const DEFAULT_LOCAL_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS = 120_000; +const DEFAULT_LOCAL_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS = 100_000; +const DEFAULT_FEDERATED_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS = 180_000; +const DEFAULT_FEDERATED_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS = 150_000; +const DEFAULT_NUMBER_OF_LAGGING_FEDERATED_SERVERS_ENTER_WARNING_ZONE = 20; +const DEFAULT_NUMBER_OF_LAGGING_FEDERATED_SERVERS_EXIT_WARNING_ZONE = 10; +const DEFAULT_REWARN_AFTER_MS = 60_000; + +/** + * A state event emitted in the moderation room when there is lag, + * redacted when lag has disappeared. + * + * The state key is the id of the room in which lag was detected. + */ +export const LAG_STATE_EVENT = "org.mjolnir.monitoring.lag"; + +/** + * Settings for a timed histogram. + */ +type HistogramSettings = { + // The width of a bucket, in ms. + bucketDurationMS: number, + // The number of buckets. + bucketNumber: number; +} + +/** + * A histogram with time as x and some arbitrary value T as y. + */ +class TimedHistogram { + /** + * An array of at most `this.settings.bucketNumber` buckets of events. + * + * Each bucket gathers all events that were pushed during an interval of + * `this.settings.bucketDurationMS` starting at `bucket.timeStamp`. + * + * `0` is the oldest bucket. + * .. + * `length - 1` is the most recent bucket. + * + * Notes: + * - this is a sparse array, buckets are not necessarily adjacent; + * - if `this.updateSettings()` is called, we do not redistribute events + * between buckets, so it may take some time before statistics fully + * respect the new settings. + */ + protected buckets: { + start: Date; + events: T[] + }[]; + + /** + * Construct an empty TimedHistogram + */ + constructor(private settings: HistogramSettings) { + this.buckets = [] + } + + /** + * Push a new event into the histogram. + * + * New events are always considered most recent, without checking `new`. + * If pushing a new event causes the histogram to overflow, oldest buckets + * are removed. + * + * @param event The event to push. + * @param now The current date, used to create a new bucket to the event if + * necessary and to determine whether some buckets are too old. + */ + push(event: T, now: Date) { + let timeStamp = now.getTime(); + let latestBucket = this.buckets[this.buckets.length - 1]; + if (latestBucket && latestBucket.start.getTime() + this.settings.bucketDurationMS >= timeStamp) { + // We're still within `durationPerColumnMS` of latest entry, we can reuse that entry. + latestBucket.events.push(event); + return; + } + // Otherwise, initialize an entry, then prune columns that are too old. + this.buckets.push({ + start: now, + events: [event] + }); + this.trimBuckets(this.settings, now); + } + + /** + * If any buckets are too old, remove them. If there are (still) too + * many buckets, remove the oldest ones. + */ + private trimBuckets(settings: HistogramSettings, now: Date) { + if (this.buckets.length > settings.bucketNumber) { + this.buckets.splice(0, this.buckets.length - settings.bucketNumber); + } + const oldestAcceptableTimestamp = now.getTime() - settings.bucketDurationMS * settings.bucketNumber; + for (let i = this.buckets.length - 2; i >= 0; --i) { + // Find the most recent bucket that is too old. + if (this.buckets[i].start.getTime() < oldestAcceptableTimestamp) { + // ...and remove that bucket and every bucket before it. + this.buckets.splice(0, i + 1); + break; + } + } + } + + /** + * Change the settings of a histogram. + */ + public updateSettings(settings: HistogramSettings, now: Date) { + this.trimBuckets(settings, now); + this.settings = settings; + } +} + +/** + * General-purpose statistics on a sample. + */ +class Stats { + // Minimum. + public readonly min: number; + // Maximum. + public readonly max: number; + // Mean. + public readonly mean: number; + // Median. + public readonly median: number; + // Standard deviation. + public readonly stddev: number; + // Length of the sample. + public readonly length: number; + + constructor(values: number[]) { + this.length = values.length; + if (this.length === 0) { + throw new TypeError("Attempting to compute stats on an empty sample"); + } + if (this.length === 1) { + // `values[Math.ceil(this.length / 2)]` below fails when `this.length == 1`. + this.min = + this.max = + this.mean = + this.median = values[0]; + this.stddev = 0; + return; + } + values.sort((a, b) => a - b); // Don't forget to force sorting by value, not by stringified value! + this.min = values[0]; + this.max = values[this.length - 1]; + let total = 0; + for (let num of values) { + total += num; + } + this.mean = total / this.length; + + let totalVariance = 0; + for (let num of values) { + const deviation = num - this.mean; + totalVariance += deviation * deviation; + } + this.stddev = Math.sqrt(totalVariance / this.length); + + if (this.length % 2 === 0) { + this.median = values[this.length / 2]; + } else { + this.median = (values[Math.floor(this.length / 2)] + values[Math.ceil(this.length / 2)]) / 2; + } + } + + public round(): { min: number, max: number, mean: number, median: number, stddev: number, length: number } { + return { + min: Math.round(this.min), + max: Math.round(this.max), + mean: Math.round(this.mean), + median: Math.round(this.median), + stddev: Math.round(this.stddev), + length: this.length + } + } +} + +/** + * A subclass of TimedHistogram that supports only numbers + * and can compute statistics. + */ +class NumbersTimedHistogram extends TimedHistogram { + constructor(settings: HistogramSettings) { + super(settings); + } + + /** + * Compute stats. + * + * @returns `null` if the histogram is empty, otherwise `Stats`. + */ + public stats(): Stats | null { + if (this.buckets.length === 0) { + return null; + } + let numbers = []; + for (let bucket of this.buckets) { + numbers.push(...bucket.events); + } + if (numbers.length === 0) { + return null; + } + return new Stats(numbers); + } +} + +/** + * Lag information on a server for a specific room. + * + * The same server may be represented by distinct instances of `ServerInfo` in + * distinct rooms. + */ +class ServerInfo { + /** + * The histogram collecting lag, in ms. + */ + private histogram: NumbersTimedHistogram; + + /** + * Date of the latest message received from this server. + * + * May be used to clean up data structures. + */ + public latestMessage: Date = new Date(0); + public latestStatsUpdate: Date; + + constructor(settings: HistogramSettings, now: Date) { + this.histogram = new NumbersTimedHistogram(settings); + this.latestStatsUpdate = now; + } + + /** + * Record lag information on this server. + * + * @param lag The duration of lag, in ms. + */ + pushLag(lag: number, now: Date) { + this.latestMessage = now; + this.histogram.push(lag, now); + } + + updateSettings(settings: HistogramSettings, now: Date) { + this.histogram.updateSettings(settings, now); + } + + /** + * Compute stats. + * + * @returns `null` if the histogram is empty, otherwise `Stats`. + */ + stats(now?: Date) { + if (now) { + this.latestStatsUpdate = now; + } + return this.histogram.stats(); + } +} + +/** + * Thresholds to start/stop warning of an issue. + * + * Once we have hit a value higher that `enterWarningZone`, the alert + * will remain active until the value decreases below `exitWarningZone`. + */ +type WarningThresholds = { + enterWarningZone: number, + exitWarningZone: number +} + +enum AlertDiff { + Start, + Stop, + NoChange +} + +/** + * Statistics to help determine whether we should raise the alarm on lag in a room. + * + * Each individual server may have lag. + */ +class RoomInfo { + /** + * A map of domain => lag information. + */ + private serverLags: Map = new Map(); + /** + * The set of servers currently on alert. + */ + private serverAlerts: Set = new Set(); + + /** + * Global lag information for this room. + */ + public totalLag: ServerInfo; + + /** + * If non-`null`, the date at which this room started being on alert. + * Otherwise, the room is not an alert. + */ + public latestAlertStart: Date | null; + + /** + * The date at which we last issued a warning on this room. + * + * Used to avoid spamming the monitoring room with too many warnings per room. + */ + public latestWarning: Date = new Date(0); + + /** + * If non-`null`, we have issued a structured warning as a state event. + * This needs to be redacted once the alert has passed. + */ + public warnStateEventId: string | null = null; + + /** + * The date at which we last received a message in this room. + */ + public latestMessage: Date = new Date(0); + + constructor(now: Date) { + this.serverLags = new Map(); + this.totalLag = new ServerInfo({ + bucketDurationMS: DEFAULT_BUCKET_DURATION_MS, + bucketNumber: DEFAULT_BUCKET_NUMBER + }, now); + } + + /** + * Add a lag annotation. + * + * @param serverId The server from which the message was sent. Could be the local server. + * @param lag How many ms of lag was measured. Hopefully ~0. + * @param settings Settings used in case we need to create or update the histogram. + * @param thresholds The thresholds to use to determine whether an origin server is currently lagging. + * @param now Instant at which all of this was measured. + */ + pushLag(serverId: string, lag: number, settings: HistogramSettings, thresholds: WarningThresholds, now: Date = new Date()): AlertDiff { + this.latestMessage = now; + + // Update per-server lag. + let serverInfo = this.serverLags.get(serverId); + if (!serverInfo) { + serverInfo = new ServerInfo(settings, now); + this.serverLags.set(serverId, serverInfo); + } else { + serverInfo.updateSettings(settings, now); + } + serverInfo.pushLag(lag, now); + + // Update global lag. + this.totalLag.updateSettings(settings, now); + this.totalLag.pushLag(lag, now); + + // Check for alerts, if necessary. + if (serverInfo.latestStatsUpdate.getTime() + settings.bucketDurationMS > now.getTime()) { + // Too early to recompute stats. + return AlertDiff.NoChange; + } + + let stats = serverInfo.stats(now)!; + if (stats.median > thresholds.enterWarningZone) { + // Oops, we're now on alert for this server. + let previous = this.serverAlerts.has(serverId); + if (!previous) { + this.serverAlerts.add(serverId); + return AlertDiff.Start; + } + } else if (stats.median < thresholds.exitWarningZone) { + // Ah, we left the alert zone. + let previous = this.serverAlerts.has(serverId); + if (previous) { + this.serverAlerts.delete(serverId); + return AlertDiff.Stop; + } + } + return AlertDiff.NoChange; + } + + /** + * The number of servers currently on alert. + */ + public get alerts(): number { + return this.serverAlerts.size; + } + + /** + * The current global stats. + * + * These stats are not separated by remote server. + * + * @returns null if we have no recent data at all, + * some stats otherwise. + */ + public globalStats(): Stats | null { + return this.totalLag.stats(); + } + + /** + * Check if a server is currently marked as lagging. + * + * A server is marked as lagging if its mean lag has exceeded + * `threshold.enterWarningZone` and has not decreased below + * `threshold.exitWarningZone`. + * + * @returns `true` is that server is currently on alert. + */ + public isServerOnAlert(serverId: string): boolean { + return this.serverAlerts.has(serverId); + } + + /** + * The list of servers currently on alert. + */ + public serversOnAlert(): IterableIterator { + return this.serverAlerts.keys(); + } + + public cleanup(settings: HistogramSettings, now: Date, oldest: Date) { + // Cleanup global histogram. + // + // If `oldest == now - settings.duration * settings.number`, this + // should correspond exactly to the cleanup that takes place within + // `this.serverLags`. There is a risk of inconsistency between data + // if this is not the case. + // + // We assume that this is an acceptable risk: as we regularly + // erase oldest data from both `this.totalLag` and individual + // entries of `this.serverLags`, both sets of data will eventually + // catch up with each other. + this.totalLag.updateSettings(settings, now); + let serverLagsDeleteIds = []; + for (let [serverId, serverStats] of this.serverLags) { + if (serverStats.latestMessage < oldest) { + // Remove entire histogram. + serverLagsDeleteIds.push(serverId); + continue; + } + // Cleanup histogram. + serverStats.updateSettings(settings, now); + } + for (let key of serverLagsDeleteIds) { + this.serverLags.delete(key); + this.serverAlerts.delete(key); + // Note that we remove the alert to save memory (it's not really useful + // to keep monitoring a server for too long after receiving a message) + // but this does NOT guaranteed that server lag is over. It may be that + // the server is down or that the server is lagging by more than ~1h + // (by default). + } + } +} + +export class DetectFederationLag extends Protection { + /** + * For each room we're monitoring, lag information. + */ + lagPerRoom: Map = new Map(); + public settings = { + // Rooms to ignore. + ignoreRooms: new StringSetProtectionSetting(), + // Servers to ignore, typically because they're known to be slow. + ignoreServers: new StringSetProtectionSetting(), + // How often we should recompute lag. + bucketDuration: new DurationMSProtectionSetting(DEFAULT_BUCKET_DURATION_MS, 100), + // How long we should remember lag in a room (`bucketDuration * bucketNumber` ms). + bucketNumber: new NumberProtectionSetting(DEFAULT_BUCKET_NUMBER, 1), + // How much lag before the local homeserver is considered lagging. + localHomeserverLagEnterWarningZone: new DurationMSProtectionSetting(DEFAULT_LOCAL_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS, 1), + // How much lag before the local homeserver is considered not lagging anymore. + localHomeserverLagExitWarningZone: new DurationMSProtectionSetting(DEFAULT_LOCAL_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS, 1), + // How much lag before a federated homeserver is considered lagging. + federatedHomeserverLagEnterWarningZone: new DurationMSProtectionSetting(DEFAULT_FEDERATED_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS, 1), + // How much lag before a federated homeserver is considered not lagging anymore. + federatedHomeserverLagExitWarningZone: new DurationMSProtectionSetting(DEFAULT_FEDERATED_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS, 1), + // How much time we should wait before printing a new warning. + warnAgainAfter: new DurationMSProtectionSetting(DEFAULT_REWARN_AFTER_MS, 1), + // How many federated homeservers it takes to trigger an alert. + // You probably want to update this if you're monitoring a room that + // has many underpowered homeservers. + numberOfLaggingFederatedHomeserversEnterWarningZone: new NumberProtectionSetting(DEFAULT_NUMBER_OF_LAGGING_FEDERATED_SERVERS_ENTER_WARNING_ZONE, 1), + // How many federated homeservers it takes before we're considered not on alert anymore. + // You probably want to update this if you're monitoring a room that + // has many underpowered homeservers. + numberOfLaggingFederatedHomeserversExitWarningZone: new NumberProtectionSetting(DEFAULT_NUMBER_OF_LAGGING_FEDERATED_SERVERS_EXIT_WARNING_ZONE, 1), + // How long to wait before actually collecting statistics. + // Used to avoid being misled by Mjölnir catching up with old messages on first sync. + initialDelayGrace: new DurationMSProtectionSetting(DEFAULT_INITIAL_DELAY_GRACE_MS, 0), + cleanupPeriod: new DurationMSProtectionSetting(DEFAULT_CLEANUP_PERIOD_MS, 1), + }; + // The instant at which the first message was received. + private firstMessage: Date | null = null; + // The latest instant at which we have started cleaning up old data. + private latestCleanup: Date = new Date(0); + private latestHistogramSettings: HistogramSettings; + constructor() { + super(); + // Initialize and watch `this.latestHistogramSettings`. + this.updateLatestHistogramSettings(); + this.settings.bucketDuration.on("set", () => this.updateLatestHistogramSettings()); + this.settings.bucketNumber.on("set", () => this.updateLatestHistogramSettings()); + } + dispose() { + this.settings.bucketDuration.removeAllListeners(); + this.settings.bucketNumber.removeAllListeners(); + } + public get name(): string { + return 'DetectFederationLag'; + } + public get description(): string { + return `Warn moderators if either the local homeserver starts lagging by ${this.settings.localHomeserverLagEnterWarningZone.value}ms or at least ${this.settings.numberOfLaggingFederatedHomeserversEnterWarningZone.value} start lagging by at least ${this.settings.federatedHomeserverLagEnterWarningZone.value}ms.`; + } + + /** + * @param now An argument used only by tests, to simulate events taking place at a specific date. + */ + public async handleEvent(mjolnir: Mjolnir, roomId: string, event: any, now: Date = new Date()) { + // First, handle all cases in which we should ignore the event. + if (!this.firstMessage) { + this.firstMessage = now; + } + if (this.firstMessage.getTime() + this.settings.initialDelayGrace.value > now.getTime()) { + // We're still in the initial grace period, ignore. + return; + } + if (this.latestCleanup.getTime() + this.settings.cleanupPeriod.value > now.getTime()) { + // We should run some cleanup. + this.latestCleanup = now; + this.cleanup(now); + } + if (this.settings.ignoreRooms.value.has(roomId)) { + // Room is ignored. + return; + } + const sender = event['sender'] as string; + if (typeof sender !== "string") { + // Ill-formed event. + return; + } + if (sender === await mjolnir.client.getUserId()) { + // Let's not create loops. + return; + } + const domain = new UserID(sender).domain; + if (!domain) { + // Ill-formed event. + return; + } + + const origin = event['origin_server_ts'] as number; + if (typeof origin !== "number" || isNaN(origin)) { + // Ill-formed event. + return; + } + const delay = now.getTime() - origin; + if (delay < 0) { + // Could be an ill-formed event. + // Could be non-motonic clocks or other time shennanigans. + return; + } + + let roomInfo = this.lagPerRoom.get(roomId); + if (!roomInfo) { + roomInfo = new RoomInfo(now); + this.lagPerRoom.set(roomId, roomInfo); + } + + const localDomain = new UserID(await mjolnir.client.getUserId()).domain + const isLocalDomain = domain === localDomain; + const thresholds = + isLocalDomain + ? { + enterWarningZone: this.settings.localHomeserverLagEnterWarningZone.value, + exitWarningZone: this.settings.localHomeserverLagExitWarningZone.value, + } + : { + enterWarningZone: this.settings.federatedHomeserverLagEnterWarningZone.value, + exitWarningZone: this.settings.federatedHomeserverLagExitWarningZone.value, + }; + + const diff = roomInfo.pushLag(domain, delay, this.latestHistogramSettings, thresholds, now); + if (diff === AlertDiff.NoChange) { + return; + } + + if (roomInfo.latestWarning.getTime() + this.settings.warnAgainAfter.value > now.getTime()) { + if (!isLocalDomain || diff !== AlertDiff.Start) { + // No need to check for alarms, we have raised an alarm recently. + return; + } + } + + // Check whether an alarm needs to be raised! + const isLocalDomainOnAlert = roomInfo.isServerOnAlert(localDomain); + if (roomInfo.alerts > this.settings.numberOfLaggingFederatedHomeserversEnterWarningZone.value + || isLocalDomainOnAlert) { + // Raise the alarm! + if (!roomInfo.latestAlertStart) { + roomInfo.latestAlertStart = now; + } + roomInfo.latestAlertStart = now; + // Background-send message. + const stats = roomInfo.globalStats(); + /* do not await */ mjolnir.logMessage(LogLevel.WARN, "FederationLag", + `Room ${roomId} is experiencing ${isLocalDomainOnAlert ? "LOCAL" : "federated"} lag since ${roomInfo.latestAlertStart}.\n${roomInfo.alerts} homeservers are lagging: ${[...roomInfo.serversOnAlert()].sort()} .\nRoom lag statistics: ${JSON.stringify(stats, null, 2)}.`); + // Drop a state event, for the use of potential other bots. + const warnStateEventId = await mjolnir.client.sendStateEvent(mjolnir.managementRoomId, LAG_STATE_EVENT, roomId, { + domains: [...roomInfo.serversOnAlert()], + roomId, + // We need to round the stats, as Matrix doesn't support floating-point + // numbers in messages. + stats: stats?.round(), + since: roomInfo.latestAlertStart, + }); + roomInfo.warnStateEventId = warnStateEventId; + } else if (roomInfo.alerts < this.settings.numberOfLaggingFederatedHomeserversExitWarningZone.value + || !isLocalDomainOnAlert) { + // Stop the alarm! + /* do not await */ mjolnir.logMessage(LogLevel.INFO, "FederationLag", + `Room ${roomId} lag has decreased to an acceptable level. Currently, ${roomInfo.alerts} homeservers are still lagging` + ); + if (roomInfo.warnStateEventId) { + const warnStateEventId = roomInfo.warnStateEventId; + roomInfo.warnStateEventId = null; + await mjolnir.client.redactEvent(mjolnir.managementRoomId, warnStateEventId, "Alert over"); + } + } + } + + /** + * Run cleanup on data structures, to save memory. + * + * @param now Now. + * @param oldest Prune any data older than `oldest`. + */ + public async cleanup(now: Date = new Date()) { + const oldest: Date = this.getOldestAcceptableData(now); + const lagPerRoomDeleteIds = []; + for (const [roomId, roomInfo] of this.lagPerRoom) { + if (roomInfo.latestMessage < oldest) { + // We need to remove the entire room. + lagPerRoomDeleteIds.push(roomId); + continue; + } + // Clean room stats. + roomInfo.cleanup(this.latestHistogramSettings, now, oldest); + } + for (const roomId of lagPerRoomDeleteIds) { + this.lagPerRoom.delete(roomId); + } + } + + private getOldestAcceptableData(now: Date): Date { + return new Date(now.getTime() - this.latestHistogramSettings.bucketDurationMS * this.latestHistogramSettings.bucketNumber) + } + private updateLatestHistogramSettings() { + this.latestHistogramSettings = Object.freeze({ + bucketDurationMS: this.settings.bucketDuration.value, + bucketNumber: this.settings.bucketNumber.value, + }); + }; + + /** + * Return (mostly) human-readable lag status. + */ + public async statusCommand(mjolnir: Mjolnir, subcommand: string[]): Promise<{html: string, text: string} | null> { + const roomId = subcommand[0] || "*"; + const localDomain = new UserID(await mjolnir.client.getUserId()).domain; + const annotatedStats = (roomInfo: RoomInfo) => { + const stats = roomInfo.globalStats()?.round(); + if (!stats) { + return null; + } + const isLocalDomainOnAlert = roomInfo.isServerOnAlert(localDomain); + const numberOfServersOnAlert = roomInfo.alerts; + if (isLocalDomainOnAlert) { + (stats as any)["warning"] = "Local homeserver is lagging"; + } else if (numberOfServersOnAlert > this.settings.numberOfLaggingFederatedHomeserversEnterWarningZone.value) { + (stats as any)["warning"] = `${numberOfServersOnAlert} homeservers are lagging`; + } + return stats; + }; + let text; + let html; + if (roomId === "*") { + // Collate data from all protected rooms. + const result: any = {}; + + for (const [perRoomId, perRoomInfo] of this.lagPerRoom.entries()) { + const key = await mjolnir.client.getPublishedAlias(perRoomId) || perRoomId; + result[key] = annotatedStats(perRoomInfo); + } + text = JSON.stringify(result, null, 2); + html = `${JSON.stringify(result, null, "  ")}`; + } else { + // Fetch data from a specific room. + const roomInfo = this.lagPerRoom.get(roomId); + if (!roomInfo) { + html = text = `Either ${roomId} is unmonitored or it has received no messages in a while`; + } else { + // Fetch data from all remote homeservers. + const stats = annotatedStats(roomInfo); + if (!stats) { + html = text = `No recent messages in room ${roomId}`; + } else { + text = JSON.stringify(stats, null, 2); + html = `${JSON.stringify(stats, null, "  ")}`; + } + } + } + return { + text, + html + } + } +} diff --git a/src/protections/IProtection.ts b/src/protections/IProtection.ts index 129098f..b8a8f1b 100644 --- a/src/protections/IProtection.ts +++ b/src/protections/IProtection.ts @@ -23,31 +23,34 @@ import { AbstractProtectionSetting } from "./ProtectionSettings"; * * Protections are guaranteed to be run before redaction handlers. */ -export interface IProtection { - readonly name: string; - readonly description: string; - enabled: boolean; - settings: { [setting: string]: AbstractProtectionSetting }; - /* - * Handle a single event from a protected room, to decide if we need to - * respond to it - */ - handleEvent(mjolnir: Mjolnir, roomId: string, event: any): Promise; - /* - * Handle a single reported event from a protecte room, to decide if we - * need to respond to it - */ - handleReport(mjolnir: Mjolnir, roomId: string, reporterId: string, reason: string, event: any): Promise; -} -export abstract class Protection implements IProtection { +export abstract class Protection { abstract readonly name: string abstract readonly description: string; enabled = false; abstract settings: { [setting: string]: AbstractProtectionSetting }; + + + /* + * Handle a single event from a protected room, to decide if we need to + * respond to it + */ handleEvent(mjolnir: Mjolnir, roomId: string, event: any): Promise { return Promise.resolve(null); } + + /* + * Handle a single reported event from a protecte room, to decide if we + * need to respond to it + */ handleReport(mjolnir: Mjolnir, roomId: string, reporterId: string, event: any, reason?: string): Promise { return Promise.resolve(null); } + + /** + * Return status information for `!mjolnir status ${protectionName}`. + */ + async statusCommand(mjolnir: Mjolnir, subcommand: string[]): Promise<{html: string, text: string} | null> { + // By default, protections don't have any status to show. + return null; + } } diff --git a/src/protections/ProtectionSettings.ts b/src/protections/ProtectionSettings.ts index b6653ec..f8f2f37 100644 --- a/src/protections/ProtectionSettings.ts +++ b/src/protections/ProtectionSettings.ts @@ -14,13 +14,24 @@ See the License for the specific language governing permissions and limitations under the License. */ +import { EventEmitter } from "events"; +import { default as parseDuration } from "parse-duration"; + +// Define a few aliases to simplify parsing durations. + +parseDuration["milliseconds"] = parseDuration["millis"] = parseDuration["ms"]; +parseDuration["days"] = parseDuration["day"]; +parseDuration["weeks"] = parseDuration["week"] = parseDuration["wk"]; +parseDuration["months"] = parseDuration["month"]; +parseDuration["years"] = parseDuration["year"]; + export class ProtectionSettingValidationError extends Error {}; /* * @param TChange Type for individual pieces of data (e.g. `string`) * @param TValue Type for overall value of this setting (e.g. `string[]`) */ -export class AbstractProtectionSetting { +export class AbstractProtectionSetting extends EventEmitter { // the current value of this setting value: TValue @@ -50,6 +61,7 @@ export class AbstractProtectionSetting { */ setValue(data: TValue) { this.value = data; + this.emit("set", data); } } export class AbstractProtectionListSetting extends AbstractProtectionSetting { @@ -88,10 +100,30 @@ export class StringListProtectionSetting extends AbstractProtectionListSetting data; validate = (data: string): boolean => true; addValue(data: string): string[] { - return [...this.value, data]; + this.emit("add", data); + this.value.push(data); + return this.value; } removeValue(data: string): string[] { - return this.value.filter(i => i !== data); + this.emit("remove", data); + this.value = this.value.filter(i => i !== data); + return this.value; + } +} + +export class StringSetProtectionSetting extends AbstractProtectionListSetting> { + value: Set = new Set(); + fromString = (data: string): string => data; + validate = (data: string): boolean => true; + addValue(data: string): Set { + this.emit("add", data); + this.value.add(data); + return this.value; + } + removeValue(data: string): Set { + this.emit("remove", data); + this.value.delete(data); + return this.value; } } @@ -125,5 +157,30 @@ export class NumberProtectionSetting extends AbstractProtectionSetting { + constructor( + defaultValue: number, + public readonly minMS: number|undefined = undefined, + public readonly maxMS: number|undefined = undefined + ) { + super(); + this.setValue(defaultValue); + } + + fromString(data: string) { + let number = parseDuration(data); + return isNaN(number) ? undefined : number; + } + validate(data: number) { + return (!isNaN(data) + && (this.minMS === undefined || this.minMS <= data) + && (this.maxMS === undefined || data <= this.maxMS)) + } } diff --git a/src/protections/protections.ts b/src/protections/protections.ts index 624c6c3..61aa513 100644 --- a/src/protections/protections.ts +++ b/src/protections/protections.ts @@ -15,18 +15,20 @@ limitations under the License. */ import { FirstMessageIsImage } from "./FirstMessageIsImage"; -import { IProtection } from "./IProtection"; +import { Protection } from "./IProtection"; import { BasicFlooding } from "./BasicFlooding"; +import { DetectFederationLag } from "./DetectFederationLag"; import { WordList } from "./WordList"; import { MessageIsVoice } from "./MessageIsVoice"; import { MessageIsMedia } from "./MessageIsMedia"; import { TrustedReporters } from "./TrustedReporters"; -export const PROTECTIONS: IProtection[] = [ +export const PROTECTIONS: Protection[] = [ new FirstMessageIsImage(), new BasicFlooding(), new WordList(), new MessageIsVoice(), new MessageIsMedia(), - new TrustedReporters() + new TrustedReporters(), + new DetectFederationLag(), ]; diff --git a/test/integration/detectFederationLagTest.ts b/test/integration/detectFederationLagTest.ts new file mode 100644 index 0000000..e679f23 --- /dev/null +++ b/test/integration/detectFederationLagTest.ts @@ -0,0 +1,266 @@ +import { strict as assert } from "assert"; + +import { UserID } from "matrix-bot-sdk"; +import { Suite } from "mocha"; +import { Mjolnir } from "../../src/Mjolnir"; +import { DetectFederationLag, LAG_STATE_EVENT } from "../../src/protections/DetectFederationLag"; +import { getFirstReply } from "./commands/commandUtils"; +import { newTestUser } from "./clientHelper"; + +const LOCAL_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS = 180_000; +const LOCAL_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS = 100_000; +const FEDERATED_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS = 300_000; +const FEDERATED_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS = 200_000; +const BUCKET_DURATION_MS = 100; +const SAMPLE_SIZE = 100; +const NUMBER_OF_LAGGING_FEDERATED_HOMESERVERS_ENTER_WARNING_ZONE = 2; + +const RE_STATS = /(\{(:?.|\n)*\})[^}]*$/m; + +describe("Test: DetectFederationLag protection", function() { + // In this entire test, we call `handleEvent` directly, injecting + // - events that simulate lag; + // - a progression through time, to make sure that histograms get processed. + beforeEach(async function() { + // Setup an instance of DetectFederationLag + this.detector = new DetectFederationLag(); + await this.mjolnir.registerProtection(this.detector); + await this.mjolnir.enableProtection("DetectFederationLag"); + + // Setup a moderator. + this.moderator = await newTestUser({ name: { contains: "moderator" } }); + await this.moderator.joinRoom(this.mjolnir.managementRoomId); + + const SETTINGS = { + // The protection should kick in immediately. + initialDelayGrace: 0, + // Make histograms progress quickly. + bucketDuration: BUCKET_DURATION_MS, + // Three homeservers should be sufficient to raise an alert. + numberOfLaggingFederatedHomeserversEnterWarningZone: NUMBER_OF_LAGGING_FEDERATED_HOMESERVERS_ENTER_WARNING_ZONE, + + localHomeserverLagEnterWarningZone: LOCAL_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS, + localHomeserverLagExitWarningZone: LOCAL_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS, + + federatedHomeserverLagEnterWarningZone: FEDERATED_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS, + federatedHomeserverLagExitWarningZone: FEDERATED_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS, + }; + for (let key of Object.keys(SETTINGS)) { + this.detector.settings[key].setValue(SETTINGS[key]); + } + this.localDomain = new UserID(await this.mjolnir.client.getUserId()).domain; + this.protectedRoomId = `!room1:${this.localDomain}`; + this.mjolnir.addProtectedRoom(this.protectedRoomId); + + this.simulateLag = async (senders: string[], lag: number, start: Date) => { + const content = {}; + const origin_server_ts = start.getTime() - lag; + for (let i = 0; i < SAMPLE_SIZE; ++i) { + // We call directly `this.detector.handleEvent` to be able to forge old values of `origin_server_ts`. + await this.detector.handleEvent(this.mjolnir, this.protectedRoomId, { + sender: senders[i % senders.length], + origin_server_ts, + content, + }, + // Make sure that time progresses through histogram buckets. + simulateDate(start, i) + ); + } + }; + + this.getAlertEvent = async () => { + try { + let event = await this.mjolnir.client.getRoomStateEvent(this.mjolnir.managementRoomId, LAG_STATE_EVENT, this.protectedRoomId); + if (Object.keys(event).length == 0) { + // Event was redacted. + return null; + } + return event; + } catch (ex) { + // No such event. + return null; + } + }; + + this.getCommandStatus = async () => { + const protectedRoomReply = await getFirstReply(this.mjolnir.client, this.mjolnir.managementRoomId, () => { + const command = `!mjolnir status protection DetectFederationLag ${this.protectedRoomId}`; + return this.moderator.sendMessage(this.mjolnir.managementRoomId, { msgtype: 'm.text', body: command }); + }); + const globalReply = await getFirstReply(this.mjolnir.client, this.mjolnir.managementRoomId, () => { + const command = `!mjolnir status protection DetectFederationLag *`; + return this.moderator.sendMessage(this.mjolnir.managementRoomId, { msgtype: 'm.text', body: command }); + }); + const protectedRoomStatsStr = protectedRoomReply.content.body.match(RE_STATS)[0]; + const globalStatsStr = globalReply.content.body.match(RE_STATS)[0]; + return { + protectedRoomStats: protectedRoomStatsStr ? JSON.parse(protectedRoomStatsStr) : null, + globalStats: globalStatsStr ? JSON.parse(globalStatsStr) : null, + } + } + }); + + afterEach(async function() { + await this.detector.cleanup(); + this.detector.dispose(); + await this.moderator?.stop(); + }); + + let simulateDate = (start: Date, progress: number = SAMPLE_SIZE) => + new Date(start.getTime() + 2 * progress * BUCKET_DURATION_MS / SAMPLE_SIZE); + + it('DetectFederationLag doesn\'t detect lag when there isn\'t any', async function() { + this.timeout(60000); + const MULTIPLIERS = [0, 0.5, 0.9]; + + // In this test, all the events we send have a lag < {local, federated}HomeserverLagEnterWarningZoneMS. + const start = new Date(); + + // Ensure that no alert has been emitted yet. + assert.equal(await this.getAlertEvent(), null, "Initially, there should be no alert"); + + // First, let's send events from the local homeserver. + const LOCAL_SENDERS = [`@local_user:${this.localDomain}`]; + for (let multiplier of MULTIPLIERS) { + const LAG = multiplier * LOCAL_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS; + await this.simulateLag(LOCAL_SENDERS, LAG, start); + assert.equal(await this.getAlertEvent(), null, `We have sent lots of local pseudo-events with a small lag of ${LAG}, there should be NO alert`); + } + + // Three distinct remote servers should be sufficient to trigger an alert, if they all lag. + const REMOTE_SENDERS = [ + "@user2:left.example.com", + "@user3:right.example.com", + "@user4:middle.example.com", + ]; + for (let multiplier of MULTIPLIERS) { + const LAG = multiplier * FEDERATED_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS; + await this.simulateLag(REMOTE_SENDERS, LAG, start); + assert.equal(await this.getAlertEvent(), null, `We have sent lots of remote pseudo-events with a small lag of ${LAG}, there should be NO alert`); + } + + const {protectedRoomStats, globalStats} = await this.getCommandStatus(); + assert.ok(protectedRoomStats, "We should see stats for our room"); + assert.ok(protectedRoomStats.min >= 0, `min ${protectedRoomStats.min} >= 0`); + assert.ok(protectedRoomStats.min < protectedRoomStats.max); + assert.ok(protectedRoomStats.mean > 0); + assert.ok(protectedRoomStats.mean < protectedRoomStats.max); + assert.ok(protectedRoomStats.median < protectedRoomStats.max); + assert.ok(protectedRoomStats.median > 0); + assert.ok(protectedRoomStats.max >= MULTIPLIERS[MULTIPLIERS.length - 1] * FEDERATED_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS); + assert.ok(protectedRoomStats.max < FEDERATED_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS); + assert.deepEqual(globalStats, { [this.protectedRoomId]: protectedRoomStats }); + }); + + it('DetectFederationLag detects lag on local homeserver', async function() { + this.timeout(60000); + // In this test, all the events we send have a lag > localHomeserverLagEnterWarningZoneMS. + const start = new Date(); + const stop = simulateDate(start); + + // Ensure that no alert has been emitted yet. + assert.equal(await this.getAlertEvent(), null, "Initially, there should be no alert"); + + // Simulate lagging events from the local homeserver. This should trigger an alarm. + const SENDERS = [`@local_user_1:${this.localDomain}`]; + await this.simulateLag(SENDERS, 1.5 * LOCAL_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS, start); + + let lagEvent = await this.getAlertEvent(); + console.debug(lagEvent); + + assert(lagEvent, "Local lag should be reported"); + assert.equal(JSON.stringify(lagEvent.domains), JSON.stringify([this.localDomain]), "Lag report should mention only the local domain"); + assert.equal(lagEvent.roomId, this.protectedRoomId, "Lag report should mention the right room"); + assert(new Date(lagEvent.since) >= start, "Lag report should have happened since `now`"); + assert(new Date(lagEvent.since) < stop, "Lag should have been detected before the end of the bombardment"); + + { + const {protectedRoomStats, globalStats} = await this.getCommandStatus(); + assert.ok(protectedRoomStats, "We should see stats for our room"); + assert.ok(protectedRoomStats.min >= 0, `min ${protectedRoomStats.min} >= 0`); + assert.ok(protectedRoomStats.min < protectedRoomStats.max); + assert.ok(protectedRoomStats.mean > 0); + assert.ok(protectedRoomStats.mean < protectedRoomStats.max); + assert.ok(protectedRoomStats.median < protectedRoomStats.max); + assert.ok(protectedRoomStats.median > 0); + assert.ok(protectedRoomStats.max >= 1.5 * LOCAL_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS); + assert.deepEqual(globalStats, { [this.protectedRoomId]: protectedRoomStats }) + } + + // Simulate non-lagging events from the local homeserver. After a while, this should rescind the alarm. + // We switch to a new (pseudo-)user to simplify reading logs. + const SENDERS_2 = [`@local_user_2:${this.localDomain}`]; + const start2 = new Date(stop.getTime() + 1_000); + await this.simulateLag(SENDERS_2, 0.75 * LOCAL_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS, start2); + + assert.equal(await this.getAlertEvent(), null, "The alert should now be rescinded"); + + { + const {protectedRoomStats, globalStats} = await this.getCommandStatus(); + assert.ok(protectedRoomStats, "We should see stats for our room"); + assert.ok(protectedRoomStats.min >= 0, `min ${protectedRoomStats.min} >= 0`); + assert.ok(protectedRoomStats.min < protectedRoomStats.max); + assert.ok(protectedRoomStats.mean > 0); + assert.ok(protectedRoomStats.mean < protectedRoomStats.max); + assert.ok(protectedRoomStats.median < protectedRoomStats.max); + assert.ok(protectedRoomStats.median > 0); + assert.ok(protectedRoomStats.max >= 0.75 * LOCAL_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS); + assert.ok(protectedRoomStats.max < FEDERATED_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS); + assert.deepEqual(globalStats, { [this.protectedRoomId]: protectedRoomStats }) + } + }); + + it('DetectFederationLag doesn\'t report lag when only one federated homeserver lags', async function() { + this.timeout(60000); + // In this test, all the events we send have a lag > federatedHomeserverLagEnterWarningZoneMS. + const start = new Date(); + + // Ensure that no alert has been emitted yet. + assert.equal(await this.getAlertEvent(), null, "Initially, there should be no alert"); + + // First, let's send events from the local homeserver. + const SENDERS = ["@left:left.example.com"]; + await this.simulateLag(SENDERS, 1.5 * FEDERATED_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS, start); + + let lagEvent = await this.getAlertEvent(); + assert.equal(lagEvent, null, "With only one federated homeserver lagging, we shouldn't report any lag"); + }); + + it('DetectFederationLag reports lag when three federated homeservers lag', async function() { + this.timeout(60000); + // In this test, all the events we send have a lag > federatedHomeserverLagEnterWarningZoneMS. + const start = new Date(); + const stop = simulateDate(start); + + // Ensure that no alert has been emitted yet. + assert.equal(await this.getAlertEvent(), null, "Initially, there should be no alert"); + + // Simulate lagging events from remote homeservers. This should trigger an alarm. + const SENDERS = [ + "@left:left.example.com", + "@middle:middle.example.com", + "@right:right.example.com", + ]; + await this.simulateLag(SENDERS, 1.5 * FEDERATED_HOMESERVER_LAG_ENTER_WARNING_ZONE_MS, start); + + let lagEvent = await this.getAlertEvent(); + console.debug(lagEvent); + assert(lagEvent, "Local lag should be reported"); + assert.equal(JSON.stringify(lagEvent.domains.sort()), JSON.stringify(["left.example.com", "middle.example.com", "right.example.com"]), "Lag report should mention only the local domain"); + assert.equal(lagEvent.roomId, this.protectedRoomId, "Lag report should mention the right room"); + assert(new Date(lagEvent.since) >= start, "Lag report should have happened since `now`"); + assert(new Date(lagEvent.since) < stop, "Lag should have been detected before the end of the bombardment"); + + // Simulate non-lagging events from remote homeservers. After a while, this should rescind the alarm. + // We switch to new (pseudo-)users to simplify reading logs. + const SENDERS_2 = [ + "@left_2:left.example.com", + "@middle_2:middle.example.com", + "@right_2:right.example.com", + ]; + const start2 = new Date(stop.getTime() + 1_000); + await this.simulateLag(SENDERS_2, 0.75 * FEDERATED_HOMESERVER_LAG_EXIT_WARNING_ZONE_MS, start2); + + assert.equal(await this.getAlertEvent(), null, "The alert should now be rescinded"); + }); +}); diff --git a/test/integration/fixtures.ts b/test/integration/fixtures.ts index e50696d..a81d6f6 100644 --- a/test/integration/fixtures.ts +++ b/test/integration/fixtures.ts @@ -9,6 +9,7 @@ import { makeMjolnir, teardownManagementRoom } from "./mjolnirSetupUtils"; export const mochaHooks = { beforeEach: [ async function() { + console.error("---- entering test", JSON.stringify(this.currentTest.title)); // Makes MatrixClient error logs a bit easier to parse. console.log("mochaHooks.beforeEach"); // Sometimes it takes a little longer to register users. this.timeout(10000) @@ -33,6 +34,7 @@ export const mochaHooks = { ]); // remove alias from management room and leave it. await teardownManagementRoom(this.mjolnir.client, this.mjolnir.managementRoomId, config.managementRoom); + console.error("---- completed test", JSON.stringify(this.currentTest.title), "\n\n"); // Makes MatrixClient error logs a bit easier to parse. } ] }; diff --git a/yarn.lock b/yarn.lock index e517c4f..427e54f 100644 --- a/yarn.lock +++ b/yarn.lock @@ -91,6 +91,11 @@ "@types/connect" "*" "@types/node" "*" +"@types/config@0.0.41": + version "0.0.41" + resolved "https://registry.yarnpkg.com/@types/config/-/config-0.0.41.tgz#c8237ab09730380801f3643beaefa077ca5f3c28" + integrity sha512-HjXUmIld0gwvyG8MU/17QtLzOyuMX4jbGuijmS9sWsob5xxgZ/hY9cbRCaHIHqTQ3HMLhwS3F8uXq3Bt9zgzHA== + "@types/connect@*": version "3.4.35" resolved "https://registry.yarnpkg.com/@types/connect/-/connect-3.4.35.tgz#5fcf6ae445e4021d1fc2219a4873cc73a3bb2ad1" @@ -122,6 +127,11 @@ "@types/qs" "*" "@types/serve-static" "*" +"@types/html-to-text@^8.0.1": + version "8.0.1" + resolved "https://registry.yarnpkg.com/@types/html-to-text/-/html-to-text-8.0.1.tgz#e449513df2283b1adedc85bdc2f6b7187f32972a" + integrity sha512-0B/OifmJYmk5r9z9+KJtGWOF0LEjbTN4D2QeCh+mAw81JkJwC83NvNWUZFEqRT5PpnjX7vX0ab1SMGcwCs3Lag== + "@types/istanbul-lib-coverage@*", "@types/istanbul-lib-coverage@^2.0.0": version "2.0.3" resolved "https://registry.yarnpkg.com/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.3.tgz#4ba8ddb720221f432e443bd5f9117fd22cfd4762" @@ -827,7 +837,7 @@ escalade@^3.1.1: resolved "https://registry.yarnpkg.com/escalade/-/escalade-3.1.1.tgz#d8cfdc7000965c5a0174b4a82eaa5c0552742e40" integrity sha512-k0er2gUkLf8O0zKJiAhmkTnJlTvINGv7ygDNPbeIsX/TJjGJZHuh9B2UxbsaEkmlEo9MfhrSzmhIlhRlI2GXnw== -escape-html@^1.0.3, escape-html@~1.0.3: +escape-html@~1.0.3: version "1.0.3" resolved "https://registry.yarnpkg.com/escape-html/-/escape-html-1.0.3.tgz#0258eae4d3d0c0974de1c169188ef0051d1d1988" integrity sha1-Aljq5NPQwJdN4cFpGI7wBR0dGYg= @@ -2005,6 +2015,11 @@ parent-module@^1.0.0: dependencies: callsites "^3.0.0" +parse-duration@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/parse-duration/-/parse-duration-1.0.2.tgz#b9aa7d3a1363cc7e8845bea8fd3baf8a11df5805" + integrity sha512-Dg27N6mfok+ow1a2rj/nRjtCfaKrHUZV2SJpEn/s8GaVUSlf4GGRCRP1c13Hj+wfPKVMrFDqLMLITkYKgKxyyg== + parse-srcset@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/parse-srcset/-/parse-srcset-1.0.2.tgz#f2bd221f6cc970a938d88556abc589caaaa2bde1"