Very basic support for OpenMetrics (aka Prometheus) (#442)

This PR:

- creates an OpenMetrics server that enables collecting performance data from this process by e.g. a Prometheus server;
- exposes as metrics the performance of http requests with MatrixBot.

Further metrics may of course be added.
This commit is contained in:
David Teller 2023-01-05 08:37:54 +01:00 committed by GitHub
parent 5824539449
commit c3cb22bf36
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 342 additions and 10 deletions

View File

@ -177,6 +177,19 @@ health:
# Defaults to 418.
unhealthyStatus: 418
openMetrics:
# Whether openMetrics should be enabled (default false, activated for tests)
enabled: true
# The port to expose the webserver on. Defaults to 8081.
port: 9090
# The address to listen for requests on. Defaults to all addresses.
address: "0.0.0.0"
# The path to expose the monitoring endpoint at. Defaults to `/metrics`
endpoint: "/metrics"
# Options for exposing web APIs.
web:
# Whether to enable web APIs.

View File

@ -58,6 +58,7 @@
"matrix-appservice-bridge": "8.0.0",
"parse-duration": "^1.0.2",
"pg": "^8.8.0",
"prom-client": "^14.1.0",
"shell-quote": "^1.7.3",
"ulidx": "^0.3.0",
"yaml": "^2.1.1"

View File

@ -38,6 +38,7 @@ import { ProtectionManager } from "./protections/ProtectionManager";
import { RoomMemberManager } from "./RoomMembers";
import ProtectedRoomsConfig from "./ProtectedRoomsConfig";
import { MatrixEmitter, MatrixSendClient } from "./MatrixEmitter";
import { OpenMetrics } from "./webapis/OpenMetrics";
export const STATE_NOT_STARTED = "not_started";
export const STATE_CHECKING_PERMISSIONS = "checking_permissions";
@ -64,6 +65,7 @@ export class Mjolnir {
private protectedRoomsConfig: ProtectedRoomsConfig;
public readonly protectedRoomsTracker: ProtectedRoomsSet;
private webapis: WebAPIs;
private openMetrics: OpenMetrics;
public taskQueue: ThrottlingQueue;
/**
* Reporting back to the management room.
@ -233,6 +235,7 @@ export class Mjolnir {
if (config.pollReports) {
this.reportPoller = new ReportPoller(this, this.reportManager);
}
this.openMetrics = new OpenMetrics(this.config);
// Setup join/leave listener
this.roomJoins = new RoomMemberManager(this.matrixEmitter);
this.taskQueue = new ThrottlingQueue(this, config.backgroundDelayMS);
@ -261,6 +264,7 @@ export class Mjolnir {
* Start Mjölnir.
*/
public async start() {
LogService.info("Mjolnir", "Starting Mjolnir instance");
try {
// Start the web server.
console.log("Starting web server");
@ -279,7 +283,7 @@ export class Mjolnir {
}
this.reportPoller.start(reportPollSetting.from);
}
await this.openMetrics.start();
// Load the state.
this.currentState = STATE_CHECKING_PERMISSIONS;
@ -330,6 +334,7 @@ export class Mjolnir {
this.matrixEmitter.stop();
this.webapis.stop();
this.reportPoller?.stop();
this.openMetrics.stop();
}
/**

View File

@ -20,6 +20,7 @@ import { DataStore, PgDataStore } from ".//datastore";
import { Api } from "./Api";
import { IConfig } from "./config/config";
import { AccessControl } from "./AccessControl";
import { OpenMetrics } from "../webapis/OpenMetrics";
const log = new Logger("AppService");
/**
@ -29,6 +30,7 @@ const log = new Logger("AppService");
export class MjolnirAppService {
private readonly api: Api;
private readonly openMetrics: OpenMetrics;
/**
* The constructor is private because we want to ensure intialization steps are followed,
@ -42,6 +44,7 @@ export class MjolnirAppService {
private readonly dataStore: DataStore,
) {
this.api = new Api(config.homeserver.url, mjolnirManager);
this.openMetrics = new OpenMetrics(config);
}
/**
@ -144,6 +147,8 @@ export class MjolnirAppService {
this.api.start(this.config.webAPI.port);
await this.bridge.listen(port);
log.info("MjolnirAppService started successfully");
await this.openMetrics.start();
}
/**
@ -153,6 +158,7 @@ export class MjolnirAppService {
await this.bridge.close();
await this.dataStore.close();
await this.api.close();
this.openMetrics.stop();
}
/**

View File

@ -1,6 +1,7 @@
import { Cli } from "matrix-appservice-bridge";
import { MjolnirAppService } from "./AppService";
import { IConfig } from "./config/config";
import * as utils from "../utils";
/**
* This file provides the entrypoint for the appservice mode for mjolnir.
@ -20,6 +21,8 @@ const cli = new Cli({
if (config === null) {
throw new Error("Couldn't load config");
}
utils.initializeSentry(config);
utils.initializeGlobalPerformanceMetrics(config);
await MjolnirAppService.run(port, config, cli.getRegistrationFilePath());
}
});

View File

@ -39,6 +39,42 @@ export interface IConfig {
accessControlList: string,
/** configuration for matrix-appservice-bridge's Logger */
logging?: LoggingOpts,
health?: {
// If specified, attempt to upload any crash statistics to sentry.
sentry?: {
dsn: string;
// Frequency of performance monitoring.
//
// A number in [0.0, 1.0], where 0.0 means "don't bother with tracing"
// and 1.0 means "trace performance at every opportunity".
tracesSampleRate: number;
};
openMetrics?: {
/**
* If `true`, expose a web server for server metrics, e.g. performance.
*
* Intended to be used with Prometheus or another Open Metrics scrapper.
*/
enabled: boolean;
/**
* The port on which to expose server metrics.
*/
port: number;
/**
* The path at which to collect health metrics.
*
* If unspecified, use `"/metrics"`.
*/
endpoint: string;
/**
* If specified, only serve this address mask.
*
* If unspecified, use 0.0.0.0 (accessible by any host).
*/
address: string;
}
}
}
export function read(configPath: string): IConfig {

View File

@ -19,6 +19,45 @@ import { load } from "js-yaml";
import { MatrixClient, LogService } from "matrix-bot-sdk";
import Config from "config";
export interface IHealthConfig {
health?: {
// If specified, attempt to upload any crash statistics to sentry.
sentry?: {
dsn: string;
// Frequency of performance monitoring.
//
// A number in [0.0, 1.0], where 0.0 means "don't bother with tracing"
// and 1.0 means "trace performance at every opportunity".
tracesSampleRate: number;
};
openMetrics?: {
/**
* If `true`, expose a web server for server metrics, e.g. performance.
*
* Intended to be used with Prometheus or another Open Metrics scrapper.
*/
enabled: boolean;
/**
* The port on which to expose server metrics.
*/
port: number;
/**
* The path at which to collect health metrics.
*
* If unspecified, use `"/metrics"`.
*/
endpoint: string;
/**
* If specified, only serve this address mask.
*
* If unspecified, use 0.0.0.0 (accessible by any host).
*/
address: string;
}
}
}
/**
* The configuration, as read from production.yaml
*
@ -99,6 +138,30 @@ export interface IConfig {
// and 1.0 means "trace performance at every opportunity".
tracesSampleRate: number;
};
openMetrics?: {
/**
* If `true`, expose a web server for server metrics, e.g. performance.
*
* Intended to be used with Prometheus or another Open Metrics scrapper.
*/
enabled: boolean;
/**
* The port on which to expose server metrics.
*/
port: number;
/**
* The path at which to collect health metrics.
*
* If unspecified, use `"/metrics"`.
*/
endpoint: string;
/**
* If specified, only serve this address mask.
*
* If unspecified, use 0.0.0.0 (accessible by any host).
*/
address: string;
}
};
web: {
enabled: boolean;
@ -167,6 +230,12 @@ const defaultConfig: IConfig = {
healthyStatus: 200,
unhealthyStatus: 418,
},
openMetrics: {
enabled: false,
port: 9090,
address: "0.0.0.0",
endpoint: "/metrics",
}
},
web: {
enabled: false,

View File

@ -29,7 +29,7 @@ import {
import { read as configRead } from "./config";
import { Mjolnir } from "./Mjolnir";
import { initializeSentry, patchMatrixClient } from "./utils";
import { initializeSentry, initializeGlobalPerformanceMetrics, patchMatrixClient } from "./utils";
(async function () {
@ -46,6 +46,9 @@ import { initializeSentry, patchMatrixClient } from "./utils";
if (config.health.sentry) {
initializeSentry(config);
}
if (config.health.openMetrics?.enabled) {
initializeGlobalPerformanceMetrics(config);
}
const healthz = new Healthz(config);
healthz.isHealthy = false; // start off unhealthy
if (config.health.healthz.enabled) {

View File

@ -25,9 +25,10 @@ import { ClientRequest, IncomingMessage } from "http";
import { default as parseDuration } from "parse-duration";
import * as Sentry from '@sentry/node';
import * as _ from '@sentry/tracing'; // Performing the import activates tracing.
import { collectDefaultMetrics, Counter, Histogram, register } from "prom-client";
import ManagementRoomOutput from "./ManagementRoomOutput";
import { IConfig } from "./config";
import { IHealthConfig } from "./config";
import { MatrixSendClient } from "./MatrixEmitter";
// Define a few aliases to simplify parsing durations.
@ -401,17 +402,62 @@ export function patchMatrixClient() {
patchMatrixClientForRetry();
}
/**
* Initialize performance measurements for the matrix client.
*
* This method is idempotent. If `config` specifies that Open Metrics
* should not be used, it does nothing.
*/
export function initializeGlobalPerformanceMetrics(config: IHealthConfig) {
if (isGlobalPerformanceMetricsCollectorInitialized || !config.health?.openMetrics?.enabled) {
return;
}
// Collect the Prometheus-recommended metrics.
collectDefaultMetrics({ register });
// Collect matrix-bot-sdk-related metrics.
let originalRequestFn = getRequestFn();
let perfHistogram = new Histogram({
name: "mjolnir_performance_http_request",
help: "Duration of HTTP requests in seconds",
});
let successfulRequestsCounter = new Counter({
name: "mjolnir_status_api_request_pass",
help: "Number of successful API requests",
});
let failedRequestsCounter = new Counter({
name: "mjolnir_status_api_request_fail",
help: "Number of failed API requests",
});
setRequestFn(async (params: { [k: string]: any }, cb: any) => {
let timer = perfHistogram.startTimer();
return await originalRequestFn(params, function(error: object, response: any, body: string) {
// Stop timer before calling callback.
timer();
if (error) {
failedRequestsCounter.inc();
} else {
successfulRequestsCounter.inc();
}
cb(error, response, body);
});
});
isGlobalPerformanceMetricsCollectorInitialized = true;
}
let isGlobalPerformanceMetricsCollectorInitialized = false;
/**
* Initialize Sentry for error monitoring and reporting.
*
* This method is idempotent. If `config` specifies that Sentry
* should not be used, it does nothing.
*/
export function initializeSentry(config: IConfig) {
export function initializeSentry(config: IHealthConfig) {
if (sentryInitialized) {
return;
}
if (config.health.sentry) {
if (config.health?.sentry) {
// Configure error monitoring with Sentry.
let sentry = config.health.sentry;
Sentry.init({
@ -423,4 +469,4 @@ export function initializeSentry(config: IConfig) {
}
// Set to `true` once we have initialized `Sentry` to ensure
// that we do not attempt to initialize it more than once.
let sentryInitialized = false;
let sentryInitialized = false;

102
src/webapis/OpenMetrics.ts Normal file
View File

@ -0,0 +1,102 @@
/*
Copyright 2022 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { Server } from "http";
import express from "express";
import { LogService } from "matrix-bot-sdk";
import { IHealthConfig } from "../config";
import { collectDefaultMetrics, register } from "prom-client";
export class OpenMetrics {
private webController: express.Express = express();
private httpServer?: Server;
constructor(private readonly config: IHealthConfig) {
// Setup JSON parsing.
this.webController.use(express.json());
}
/**
* Start accepting requests to the OpenMetrics API.
*
* Does nothing if openMetrics is disabled in the config.
*/
public async start() {
if (!this.config.health?.openMetrics?.enabled) {
LogService.info("OpenMetrics server is disabled.");
return;
}
// Make sure that we collect the Prometheus-recommended metrics.
try {
collectDefaultMetrics({ register });
} catch (ex) {
if (ex.message.startsWith("A metric with the name")) {
// `collectDefaultMetrics` throws this error if it is called
// more than once in the same process, as is the case during
// testing.
//
// Sadly, `register.clear()`, which should be designed to
// prevent this, seems to work asynchronously and non-deterministically,
// sometimes not clearing the register at all by the time we re-register
// default metrics and sometimes clearing metrics that we haven't registered
// yet.
//
// Just ignore this error.
} else {
throw ex;
}
}
LogService.info("Starting OpenMetrics server.");
this.httpServer = this.webController.listen(this.config.health.openMetrics!.port, this.config.health.openMetrics!.address);
this.webController.options(this.config.health.openMetrics!.address, async (_request, response) => {
// reply with CORS options
response.header("Access-Control-Allow-Origin", "*");
response.header("Access-Control-Allow-Headers", "X-Requested-With, Content-Type, Authorization, Date");
response.header("Access-Control-Allow-Methods", "POST, OPTIONS");
response.status(200);
return response.send();
});
// Respond to Prometheus collection.
LogService.info(`configuring GET ${this.config.health.openMetrics.endpoint}`);
this.webController.get(this.config.health.openMetrics.endpoint, async (_request, response) => {
// set CORS headers for the response
response.header("Access-Control-Allow-Origin", "*");
response.header("Access-Control-Allow-Headers", "X-Requested-With, Content-Type, Authorization, Date");
response.header("Access-Control-Allow-Methods", "POST, OPTIONS");
try {
response.set('Content-Type', register.contentType);
response.end(await register.metrics());
} catch (ex) {
response.status(500).end(ex);
}
});
LogService.info(`configuring GET ${this.config.health.openMetrics.endpoint}... DONE`);
LogService.info("OpenMetrics server ready.");
}
public stop() {
if (this.httpServer) {
LogService.info("Stopping OpenMetrics server.");
this.httpServer.close();
this.httpServer = undefined;
}
}
public get isEnabled(): boolean {
return !!this.httpServer
}
}

View File

@ -49,7 +49,7 @@ export class WebAPIs {
// configure /report API.
if (this.config.web.abuseReporting.enabled) {
console.log(`configuring ${API_PREFIX}/report/:room_id/:event_id...`);
LogService.info(`configuring ${API_PREFIX}/report/:room_id/:event_id...`);
this.webController.options(`${API_PREFIX}/report/:room_id/:event_id`, async (request, response) => {
// reply with CORS options
response.header("Access-Control-Allow-Origin", "*");
@ -66,7 +66,7 @@ export class WebAPIs {
response.header("Access-Control-Allow-Methods", "POST, OPTIONS");
await this.handleReport({ request, response, roomId: request.params.room_id, eventId: request.params.event_id })
});
console.log(`configuring ${API_PREFIX}/report/:room_id/:event_id... DONE`);
LogService.info(`configuring ${API_PREFIX}/report/:room_id/:event_id... DONE`);
}
// configure ruleServer API.
@ -88,7 +88,7 @@ export class WebAPIs {
public stop() {
if (this.httpServer) {
console.log("Stopping WebAPIs.");
LogService.info("Stopping WebAPIs.");
this.httpServer.close();
this.httpServer = undefined;
}

View File

@ -1,5 +1,6 @@
import { read as configRead } from "../../src/config";
import { makeMjolnir, teardownManagementRoom } from "./mjolnirSetupUtils";
import { register } from "prom-client";
// When Mjolnir starts (src/index.ts) it clobbers the config by resolving the management room
// alias specified in the config (config.managementRoom) and overwriting that with the room ID.

View File

@ -21,9 +21,10 @@ import {
LogLevel,
RichConsoleLogger
} from "matrix-bot-sdk";
import { Mjolnir} from '../../src/Mjolnir';
import { overrideRatelimitForUser, registerUser } from "./clientHelper";
import { initializeSentry, patchMatrixClient } from "../../src/utils";
import { initializeGlobalPerformanceMetrics, initializeSentry, patchMatrixClient } from "../../src/utils";
import { IConfig } from "../../src/config";
/**
@ -51,6 +52,8 @@ export async function ensureAliasedRoomExists(client: MatrixClient, alias: strin
async function configureMjolnir(config: IConfig) {
// Initialize error monitoring as early as possible.
initializeSentry(config);
initializeGlobalPerformanceMetrics(config);
try {
await registerUser(config.homeserverUrl, config.pantalaimon.username, config.pantalaimon.username, config.pantalaimon.password, true)
} catch (e) {

View File

@ -0,0 +1,43 @@
import { strict as assert } from "assert";
import { getRequestFn } from "matrix-bot-sdk";
import { IConfig } from "../../src/config";
async function fetchMetrics(config: IConfig): Promise<string> {
if (!config.health.openMetrics?.enabled) {
throw new TypeError("openMetrics deactivated, cannot fetch");
}
let uri = new URL("http://localhost");
uri.port = `${config.health.openMetrics!.port}`;
uri.pathname = config.health.openMetrics!.endpoint;
return await new Promise((resolve, reject) =>
getRequestFn()({
method: "GET",
uri
}, (error: object, _response: any, body: string) => {
if (error) {
reject(error);
} else {
resolve(body);
}
})
);
}
describe("Test that we can read metrics using the API", function() {
it('can fetch default metrics', async function() {
console.debug("config", this.mjolnir.config);
let metrics = await fetchMetrics(this.mjolnir.config);
console.debug("Got metrics", metrics);
// Sample of Prometheus-recommended metrics.
for (let name of ["process_cpu_seconds_total"]) {
assert(metrics.includes(name), `Metrics should contain default metric \`${name}\``);
}
// Sample of metrics that we're injecting.
for (let name of ["mjolnir_performance_http_request_sum", "mjolnir_status_api_request_pass", "mjolnir_status_api_request_fail"]) {
assert(metrics.includes(name), `Metrics should contain custom metric \`${name}\``);
}
})
});

View File

@ -27,5 +27,6 @@
"./test/integration/banListTest.ts",
"./test/integration/reportPollingTest",
"./test/integration/policyConsumptionTest.ts",
"./test/integration/openMetricsTest.ts",
]
}