synapse-product/synapse/storage/monthly_active_users.py

330 lines
13 KiB
Python
Raw Normal View History

2018-08-03 08:49:53 -04:00
# -*- coding: utf-8 -*-
# Copyright 2018 New Vector
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
2018-08-03 08:49:53 -04:00
2018-07-31 11:36:24 -04:00
from twisted.internet import defer
2018-08-03 08:49:53 -04:00
from synapse.util.caches.descriptors import cached
2018-07-31 11:36:24 -04:00
from ._base import SQLBaseStore
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
2018-08-06 18:25:25 -04:00
2018-08-03 12:55:50 -04:00
# Granularity, in milliseconds, at which a user's monthly_active_user
# timestamp is stored. Keeping the timestamp this coarse means the table does
# not have to be updated on every request.
LAST_SEEN_GRANULARITY = 1000 * 60 * 60
2018-07-31 11:36:24 -04:00
class MonthlyActiveUsersStore(SQLBaseStore):
    """Storage methods for the monthly_active_users table, which records when
    each user was last seen so that the server's monthly active user (MAU)
    usage can be counted and, when configured, limited.
    """

    def __init__(self, dbconn, hs):
        """
        Args:
            dbconn: database connection on which the start-up transaction that
                registers the reserved users is run
            hs: the homeserver; its clock and config are read by this store
        """
        super(MonthlyActiveUsersStore, self).__init__(None, hs)
        self._clock = hs.get_clock()
        self.hs = hs
        # Do not add more reserved users than the total allowable number
        self._new_transaction(
            dbconn,
            "initialise_mau_threepids",
            [],
            [],
            self._initialise_reserved_users,
            hs.config.mau_limits_reserved_threepids[: self.hs.config.max_mau_value],
        )

    def _initialise_reserved_users(self, txn, threepids):
        """Ensures that reserved threepids are accounted for in the MAU table, should
        be called on start up.

        Args:
            txn (cursor):
            threepids (list[dict]): List of threepid dicts to reserve
        """
        for tp in threepids:
            user_id = self.get_user_id_by_threepid_txn(txn, tp["medium"], tp["address"])

            if user_id:
                # Support users should not appear in the MAU stats, so they
                # are skipped even when their threepid is reserved.
                is_support = self.is_support_user_txn(txn, user_id)
                if not is_support:
                    self.upsert_monthly_active_user_txn(txn, user_id)
            else:
                # Pass the threepid as a logger argument so the message is only
                # formatted if the record is actually emitted.
                logger.warning("mau limit reserved threepid %s not found in db", tp)

    @defer.inlineCallbacks
    def reap_monthly_active_users(self):
        """Cleans out monthly active user table to ensure that no stale
        entries exist.

        Returns:
            Deferred[]
        """

        def _reap_users(txn, reserved_users):
            """
            Args:
                txn (cursor):
                reserved_users (list[str]): user ids of the reserved users,
                    which are never deleted
            """
            # One "?" per reserved user. The placeholders are spelled out by
            # hand as a hack to overcome sqlite not supporting tuples in
            # 'WHERE IN %s'. Computed once here because both DELETE statements
            # below need them.
            question_marks = ",".join("?" * len(reserved_users))

            thirty_days_ago = int(self._clock.time_msec()) - (1000 * 60 * 60 * 24 * 30)
            query_args = [thirty_days_ago]
            sql = "DELETE FROM monthly_active_users WHERE timestamp < ?"

            # The NOT IN clause is only added when there are reserved users,
            # since 'AND user_id NOT IN ({})' fails on Postgres when
            # len(reserved_users) == 0. Works fine on sqlite.
            if reserved_users:
                sql += " AND user_id NOT IN ({})".format(question_marks)
                query_args.extend(reserved_users)

            txn.execute(sql, query_args)

            if not self.hs.config.limit_usage_by_mau:
                return

            max_mau_value = self.hs.config.max_mau_value

            # If MAU user count still exceeds the MAU threshold, then delete on
            # a least recently active basis.
            # Note it is not possible to write this query using OFFSET due to
            # incompatibilities in how sqlite and postgres support the feature.
            # sqlite requires 'LIMIT -1 OFFSET ?', the LIMIT must be present
            # While Postgres does not require 'LIMIT', but also does not support
            # negative LIMIT values. So there is no way to write it that both can
            # support
            if not reserved_users:
                sql = """
                    DELETE FROM monthly_active_users
                    WHERE user_id NOT IN (
                        SELECT user_id FROM monthly_active_users
                        ORDER BY timestamp DESC
                        LIMIT ?
                    )
                    """
                txn.execute(sql, (max_mau_value,))
            else:
                # Reserved users are never deleted, so only what is left of the
                # allowance is available to everybody else. This is the number
                # of most recently active non-reserved users to KEEP.
                # Must be >= 0 for postgres
                num_non_reserved_users_to_keep = max(
                    max_mau_value - len(reserved_users), 0
                )

                # It is important to filter reserved users twice to guard
                # against the case where the reserved user is present in the
                # SELECT, meaning that a legitimate mau is deleted.
                sql = """
                    DELETE FROM monthly_active_users
                    WHERE user_id NOT IN (
                        SELECT user_id FROM monthly_active_users
                        WHERE user_id NOT IN ({})
                        ORDER BY timestamp DESC
                        LIMIT ?
                    )
                    AND user_id NOT IN ({})
                    """.format(
                    question_marks, question_marks
                )

                query_args = [
                    *reserved_users,
                    num_non_reserved_users_to_keep,
                    *reserved_users,
                ]
                txn.execute(sql, query_args)

        reserved_users = yield self.get_registered_reserved_users()
        yield self.runInteraction(
            "reap_monthly_active_users", _reap_users, reserved_users
        )
        # It seems poor to invalidate the whole cache, Postgres supports
        # 'Returning' which would allow me to invalidate only the
        # specific users, but sqlite has no way to do this and instead
        # I would need to SELECT and the DELETE which without locking
        # is racy.
        # Have resolved to invalidate the whole cache for now and do
        # something about it if and when the perf becomes significant
        self.user_last_seen_monthly_active.invalidate_all()
        self.get_monthly_active_count.invalidate_all()

    @cached(num_args=0)
    def get_monthly_active_count(self):
        """Generates current count of monthly active users

        Returns:
            Deferred[int]: Number of current monthly active users
        """

        def _count_users(txn):
            sql = "SELECT COALESCE(count(*), 0) FROM monthly_active_users"
            txn.execute(sql)
            (count,) = txn.fetchone()
            return count

        return self.runInteraction("count_users", _count_users)

    @defer.inlineCallbacks
    def get_registered_reserved_users(self):
        """Of the reserved threepids defined in config, which are associated
        with registered users?

        Only the first max_mau_value configured threepids are considered.

        Returns:
            Deferred[list]: Real reserved users
        """
        users = []

        for tp in self.hs.config.mau_limits_reserved_threepids[
            : self.hs.config.max_mau_value
        ]:
            user_id = yield self.hs.get_datastore().get_user_id_by_threepid(
                tp["medium"], tp["address"]
            )
            if user_id:
                users.append(user_id)

        return users

    @defer.inlineCallbacks
    def upsert_monthly_active_user(self, user_id):
        """Updates or inserts the user into the monthly active user table, which
        is used to track the current MAU usage of the server

        Args:
            user_id (str): user to add/update
        """
        # Support users are never to be included in MAU stats. The check is
        # made here rather than in upsert_monthly_active_user_txn so that the
        # cached is_support_user can be used; code that calls
        # upsert_monthly_active_user_txn directly (such as
        # _initialise_reserved_users) has to make the check itself with
        # is_support_user_txn.
        is_support = yield self.is_support_user(user_id)
        if is_support:
            return

        yield self.runInteraction(
            "upsert_monthly_active_user", self.upsert_monthly_active_user_txn, user_id
        )

        # Look at the cache without touching its metrics. If it holds no
        # last-seen timestamp for this user, the upsert may have added a new
        # row, so the cached count can no longer be trusted.
        user_in_mau = self.user_last_seen_monthly_active.cache.get(
            (user_id,), None, update_metrics=False
        )
        if user_in_mau is None:
            self.get_monthly_active_count.invalidate(())

        self.user_last_seen_monthly_active.invalidate((user_id,))

    def upsert_monthly_active_user_txn(self, txn, user_id):
        """Updates or inserts monthly active user member

        Note that, after calling this method, it will generally be necessary
        to invalidate the caches on user_last_seen_monthly_active and
        get_monthly_active_count. We can't do that here, because we are running
        in a database thread rather than the main thread, and we can't call
        txn.call_after because txn may not be a LoggingTransaction.

        We consciously do not call is_support_user_txn from this method because
        it is not possible to cache the response. is_support_user_txn will be
        false in almost all cases, so it seems reasonable to make the check only
        in upsert_monthly_active_user and to call is_support_user_txn manually
        for cases where upsert_monthly_active_user_txn is called directly,
        like _initialise_reserved_users

        In short, don't call this method with support users. (Support users
        should not appear in the MAU stats).

        Args:
            txn (cursor):
            user_id (str): user to add/update

        Returns:
            bool: True if a new entry was created, False if an
            existing one was updated.
        """
        # Am consciously deciding to lock the table on the basis that is ought
        # never be a big table and alternative approaches (batching multiple
        # upserts into a single txn) introduced a lot of extra complexity.
        # See https://github.com/matrix-org/synapse/issues/3854 for more
        is_insert = self._simple_upsert_txn(
            txn,
            table="monthly_active_users",
            keyvalues={"user_id": user_id},
            values={"timestamp": int(self._clock.time_msec())},
        )

        return is_insert

    @cached(num_args=1)
    def user_last_seen_monthly_active(self, user_id):
        """
        Checks if a given user is part of the monthly active user group

        Arguments:
            user_id (str): user to look up

        Return:
            Deferred[int] : timestamp at which the user was last seen, None if
            never seen
        """
        return self._simple_select_one_onecol(
            table="monthly_active_users",
            keyvalues={"user_id": user_id},
            retcol="timestamp",
            allow_none=True,
            desc="user_last_seen_monthly_active",
        )

    @defer.inlineCallbacks
    def populate_monthly_active_users(self, user_id):
        """Checks on the state of monthly active user limits and optionally
        add the user to the monthly active tables

        Args:
            user_id(str): the user_id to query
        """
        # Nothing to track unless MAU limiting or MAU stats are switched on.
        if not (self.hs.config.limit_usage_by_mau or self.hs.config.mau_stats_only):
            return

        # Trial users and guests should not be included as part of MAU group
        is_guest = yield self.is_guest(user_id)
        if is_guest:
            return
        is_trial = yield self.is_trial_user(user_id)
        if is_trial:
            return

        last_seen_timestamp = yield self.user_last_seen_monthly_active(user_id)
        now = self.hs.get_clock().time_msec()

        # We want to reduce to the total number of db writes, and are happy
        # to trade accuracy of timestamp in order to lighten load. This means
        # We always insert new users (where MAU threshold has not been reached),
        # but only update if we have not previously seen the user for
        # LAST_SEEN_GRANULARITY ms
        if last_seen_timestamp is None:
            # In the case where mau_stats_only is True and limit_usage_by_mau is
            # False, there is no point in checking get_monthly_active_count - it
            # adds no value and will break the logic if max_mau_value is exceeded.
            if not self.hs.config.limit_usage_by_mau:
                yield self.upsert_monthly_active_user(user_id)
            else:
                count = yield self.get_monthly_active_count()
                if count < self.hs.config.max_mau_value:
                    yield self.upsert_monthly_active_user(user_id)
        elif now - last_seen_timestamp > LAST_SEEN_GRANULARITY:
            yield self.upsert_monthly_active_user(user_id)