Add 'failure_ts' column to 'destinations' table (#6016)

Track the time that a server started failing at, for general analysis purposes.
This commit is contained in:
Richard van der Hoff 2019-09-17 11:41:54 +01:00 committed by GitHub
parent 850dcfd2d3
commit 1e19ce00bf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 195 additions and 12 deletions

1
changelog.d/6016.misc Normal file
View File

@ -0,0 +1 @@
Add a 'failure_ts' column to the 'destinations' database table.

View File

@ -0,0 +1,25 @@
/* Copyright 2019 The Matrix.org Foundation C.I.C
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * Record the timestamp when a given server started failing
 * (presumably ms since epoch, matching retry_last_ts — NULL means the
 * destination is not currently known to be failing).
 */
ALTER TABLE destinations ADD failure_ts BIGINT;
/* as a rough approximation, we assume that the server started failing at
 * retry_interval before the last retry
 */
UPDATE destinations SET failure_ts = retry_last_ts - retry_interval
WHERE retry_last_ts > 0;

View File

@ -165,7 +165,7 @@ class TransactionStore(SQLBaseStore):
txn, txn,
table="destinations", table="destinations",
keyvalues={"destination": destination}, keyvalues={"destination": destination},
retcols=("destination", "retry_last_ts", "retry_interval"), retcols=("destination", "failure_ts", "retry_last_ts", "retry_interval"),
allow_none=True, allow_none=True,
) )
@ -174,12 +174,15 @@ class TransactionStore(SQLBaseStore):
else: else:
return None return None
def set_destination_retry_timings(self, destination, retry_last_ts, retry_interval): def set_destination_retry_timings(
self, destination, failure_ts, retry_last_ts, retry_interval
):
"""Sets the current retry timings for a given destination. """Sets the current retry timings for a given destination.
Both timings should be zero if retrying is no longer occurring. Both timings should be zero if retrying is no longer occurring.
Args: Args:
destination (str) destination (str)
failure_ts (int|None) - when the server started failing (ms since epoch)
retry_last_ts (int) - time of last retry attempt in unix epoch ms retry_last_ts (int) - time of last retry attempt in unix epoch ms
retry_interval (int) - how long until next retry in ms retry_interval (int) - how long until next retry in ms
""" """
@ -189,12 +192,13 @@ class TransactionStore(SQLBaseStore):
"set_destination_retry_timings", "set_destination_retry_timings",
self._set_destination_retry_timings, self._set_destination_retry_timings,
destination, destination,
failure_ts,
retry_last_ts, retry_last_ts,
retry_interval, retry_interval,
) )
def _set_destination_retry_timings( def _set_destination_retry_timings(
self, txn, destination, retry_last_ts, retry_interval self, txn, destination, failure_ts, retry_last_ts, retry_interval
): ):
if self.database_engine.can_native_upsert: if self.database_engine.can_native_upsert:
@ -202,9 +206,12 @@ class TransactionStore(SQLBaseStore):
# resetting it) or greater than the existing retry interval. # resetting it) or greater than the existing retry interval.
sql = """ sql = """
INSERT INTO destinations (destination, retry_last_ts, retry_interval) INSERT INTO destinations (
VALUES (?, ?, ?) destination, failure_ts, retry_last_ts, retry_interval
)
VALUES (?, ?, ?, ?)
ON CONFLICT (destination) DO UPDATE SET ON CONFLICT (destination) DO UPDATE SET
failure_ts = EXCLUDED.failure_ts,
retry_last_ts = EXCLUDED.retry_last_ts, retry_last_ts = EXCLUDED.retry_last_ts,
retry_interval = EXCLUDED.retry_interval retry_interval = EXCLUDED.retry_interval
WHERE WHERE
@ -212,7 +219,7 @@ class TransactionStore(SQLBaseStore):
OR destinations.retry_interval < EXCLUDED.retry_interval OR destinations.retry_interval < EXCLUDED.retry_interval
""" """
txn.execute(sql, (destination, retry_last_ts, retry_interval)) txn.execute(sql, (destination, failure_ts, retry_last_ts, retry_interval))
return return
@ -225,7 +232,7 @@ class TransactionStore(SQLBaseStore):
txn, txn,
table="destinations", table="destinations",
keyvalues={"destination": destination}, keyvalues={"destination": destination},
retcols=("retry_last_ts", "retry_interval"), retcols=("failure_ts", "retry_last_ts", "retry_interval"),
allow_none=True, allow_none=True,
) )
@ -235,6 +242,7 @@ class TransactionStore(SQLBaseStore):
table="destinations", table="destinations",
values={ values={
"destination": destination, "destination": destination,
"failure_ts": failure_ts,
"retry_last_ts": retry_last_ts, "retry_last_ts": retry_last_ts,
"retry_interval": retry_interval, "retry_interval": retry_interval,
}, },
@ -245,6 +253,7 @@ class TransactionStore(SQLBaseStore):
"destinations", "destinations",
keyvalues={"destination": destination}, keyvalues={"destination": destination},
updatevalues={ updatevalues={
"failure_ts": failure_ts,
"retry_last_ts": retry_last_ts, "retry_last_ts": retry_last_ts,
"retry_interval": retry_interval, "retry_interval": retry_interval,
}, },

View File

@ -80,11 +80,13 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs)
# We aren't ready to retry that destination. # We aren't ready to retry that destination.
raise raise
""" """
failure_ts = None
retry_last_ts, retry_interval = (0, 0) retry_last_ts, retry_interval = (0, 0)
retry_timings = yield store.get_destination_retry_timings(destination) retry_timings = yield store.get_destination_retry_timings(destination)
if retry_timings: if retry_timings:
failure_ts = retry_timings["failure_ts"]
retry_last_ts, retry_interval = ( retry_last_ts, retry_interval = (
retry_timings["retry_last_ts"], retry_timings["retry_last_ts"],
retry_timings["retry_interval"], retry_timings["retry_interval"],
@ -108,6 +110,7 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs)
destination, destination,
clock, clock,
store, store,
failure_ts,
retry_interval, retry_interval,
backoff_on_failure=backoff_on_failure, backoff_on_failure=backoff_on_failure,
**kwargs **kwargs
@ -120,6 +123,7 @@ class RetryDestinationLimiter(object):
destination, destination,
clock, clock,
store, store,
failure_ts,
retry_interval, retry_interval,
backoff_on_404=False, backoff_on_404=False,
backoff_on_failure=True, backoff_on_failure=True,
@ -133,6 +137,8 @@ class RetryDestinationLimiter(object):
destination (str) destination (str)
clock (Clock) clock (Clock)
store (DataStore) store (DataStore)
failure_ts (int|None): when this destination started failing (in ms since
the epoch), or zero if the last request was successful
retry_interval (int): The next retry interval taken from the retry_interval (int): The next retry interval taken from the
database in milliseconds, or zero if the last request was database in milliseconds, or zero if the last request was
successful. successful.
@ -145,6 +151,7 @@ class RetryDestinationLimiter(object):
self.store = store self.store = store
self.destination = destination self.destination = destination
self.failure_ts = failure_ts
self.retry_interval = retry_interval self.retry_interval = retry_interval
self.backoff_on_404 = backoff_on_404 self.backoff_on_404 = backoff_on_404
self.backoff_on_failure = backoff_on_failure self.backoff_on_failure = backoff_on_failure
@ -186,6 +193,7 @@ class RetryDestinationLimiter(object):
logger.debug( logger.debug(
"Connection to %s was successful; clearing backoff", self.destination "Connection to %s was successful; clearing backoff", self.destination
) )
self.failure_ts = None
retry_last_ts = 0 retry_last_ts = 0
self.retry_interval = 0 self.retry_interval = 0
elif not self.backoff_on_failure: elif not self.backoff_on_failure:
@ -211,11 +219,17 @@ class RetryDestinationLimiter(object):
) )
retry_last_ts = int(self.clock.time_msec()) retry_last_ts = int(self.clock.time_msec())
if self.failure_ts is None:
self.failure_ts = retry_last_ts
@defer.inlineCallbacks @defer.inlineCallbacks
def store_retry_timings(): def store_retry_timings():
try: try:
yield self.store.set_destination_retry_timings( yield self.store.set_destination_retry_timings(
self.destination, retry_last_ts, self.retry_interval self.destination,
self.failure_ts,
retry_last_ts,
self.retry_interval,
) )
except Exception: except Exception:
logger.exception("Failed to store destination_retry_timings") logger.exception("Failed to store destination_retry_timings")

View File

@ -99,7 +99,12 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
self.event_source = hs.get_event_sources().sources["typing"] self.event_source = hs.get_event_sources().sources["typing"]
self.datastore = hs.get_datastore() self.datastore = hs.get_datastore()
retry_timings_res = {"destination": "", "retry_last_ts": 0, "retry_interval": 0} retry_timings_res = {
"destination": "",
"retry_last_ts": 0,
"retry_interval": 0,
"failure_ts": None,
}
self.datastore.get_destination_retry_timings.return_value = defer.succeed( self.datastore.get_destination_retry_timings.return_value = defer.succeed(
retry_timings_res retry_timings_res
) )

View File

@ -29,17 +29,19 @@ class TransactionStoreTestCase(HomeserverTestCase):
r = self.get_success(d) r = self.get_success(d)
self.assertIsNone(r) self.assertIsNone(r)
d = self.store.set_destination_retry_timings("example.com", 50, 100) d = self.store.set_destination_retry_timings("example.com", 1000, 50, 100)
self.get_success(d) self.get_success(d)
d = self.store.get_destination_retry_timings("example.com") d = self.store.get_destination_retry_timings("example.com")
r = self.get_success(d) r = self.get_success(d)
self.assert_dict({"retry_last_ts": 50, "retry_interval": 100}, r) self.assert_dict(
{"retry_last_ts": 50, "retry_interval": 100, "failure_ts": 1000}, r
)
def test_initial_set_transactions(self): def test_initial_set_transactions(self):
"""Tests that we can successfully set the destination retries (there """Tests that we can successfully set the destination retries (there
was a bug around invalidating the cache that broke this) was a bug around invalidating the cache that broke this)
""" """
d = self.store.set_destination_retry_timings("example.com", 50, 100) d = self.store.set_destination_retry_timings("example.com", 1000, 50, 100)
self.get_success(d) self.get_success(d)

View File

@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
# Copyright 2019 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from synapse.util.retryutils import (
MIN_RETRY_INTERVAL,
RETRY_MULTIPLIER,
NotRetryingDestination,
get_retry_limiter,
)
from tests.unittest import HomeserverTestCase
class RetryLimiterTestCase(HomeserverTestCase):
    """Tests for get_retry_limiter / RetryDestinationLimiter, covering the
    persistence of failure_ts alongside the existing retry timings.
    """

    def test_new_destination(self):
        """A happy-path case with a new destination and a successful operation"""
        store = self.hs.get_datastore()
        d = get_retry_limiter("test_dest", self.clock, store)
        self.pump()
        limiter = self.successResultOf(d)

        # advance the clock a bit before making the request
        self.pump(1)

        # a successful (non-raising) request should leave no retry timings
        # recorded for the destination
        with limiter:
            pass

        d = store.get_destination_retry_timings("test_dest")
        self.pump()
        new_timings = self.successResultOf(d)
        self.assertIsNone(new_timings)

    def test_limiter(self):
        """General test case which walks through the process of a failing request"""
        store = self.hs.get_datastore()

        d = get_retry_limiter("test_dest", self.clock, store)
        self.pump()
        limiter = self.successResultOf(d)

        self.pump(1)
        try:
            with limiter:
                self.pump(1)
                # record the time of the first failure: both failure_ts and
                # retry_last_ts should be set to this
                failure_ts = self.clock.time_msec()
                raise AssertionError("argh")
        except AssertionError:
            pass

        # wait for the update to land
        self.pump()

        d = store.get_destination_retry_timings("test_dest")
        self.pump()
        new_timings = self.successResultOf(d)
        self.assertEqual(new_timings["failure_ts"], failure_ts)
        self.assertEqual(new_timings["retry_last_ts"], failure_ts)
        self.assertEqual(new_timings["retry_interval"], MIN_RETRY_INTERVAL)

        # now if we try again we should get a failure
        d = get_retry_limiter("test_dest", self.clock, store)
        self.pump()
        self.failureResultOf(d, NotRetryingDestination)

        #
        # advance the clock and try again
        #

        self.pump(MIN_RETRY_INTERVAL)
        d = get_retry_limiter("test_dest", self.clock, store)
        self.pump()
        limiter = self.successResultOf(d)

        self.pump(1)
        try:
            with limiter:
                self.pump(1)
                retry_ts = self.clock.time_msec()
                raise AssertionError("argh")
        except AssertionError:
            pass

        # wait for the update to land
        self.pump()

        d = store.get_destination_retry_timings("test_dest")
        self.pump()
        new_timings = self.successResultOf(d)
        # failure_ts is preserved across repeated failures; only
        # retry_last_ts and retry_interval move forward
        self.assertEqual(new_timings["failure_ts"], failure_ts)
        self.assertEqual(new_timings["retry_last_ts"], retry_ts)
        # the retry interval is multiplied by RETRY_MULTIPLIER with some
        # jitter, hence the bounded-range assertions rather than an exact
        # value — TODO confirm the jitter bounds against retryutils
        self.assertGreaterEqual(
            new_timings["retry_interval"], MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
        )
        self.assertLessEqual(
            new_timings["retry_interval"], MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
        )

        #
        # one more go, with success
        #
        self.pump(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0)
        d = get_retry_limiter("test_dest", self.clock, store)
        self.pump()
        limiter = self.successResultOf(d)

        self.pump(1)
        with limiter:
            self.pump(1)

        # wait for the update to land
        self.pump()

        # a success clears the retry timings entirely
        d = store.get_destination_retry_timings("test_dest")
        self.pump()
        new_timings = self.successResultOf(d)
        self.assertIsNone(new_timings)