Track ongoing event fetches correctly in the presence of failure (#11240)

When an event fetcher aborts due to an exception, `_event_fetch_ongoing`
must be decremented, otherwise the event fetcher would never be
replaced. If enough event fetchers were to fail, no more events would be
fetched and requests would get stuck waiting for events.
This commit is contained in:
Sean Quah 2021-11-04 10:33:53 +00:00 committed by GitHub
parent a271e233e9
commit 8eec25a1d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 33 additions and 20 deletions

1
changelog.d/11240.bugfix Normal file
View File

@@ -0,0 +1 @@
Fix a long-standing bug where all requests that read events from the database could get stuck as a result of losing the database connection.

synapse/storage/databases/main/events_worker.py (filename inferred from the hunk context lines; lost in the original listing) — View File

@@ -28,6 +28,7 @@ from typing import (
 import attr
 from constantly import NamedConstant, Names
+from prometheus_client import Gauge
 from typing_extensions import Literal

 from twisted.internet import defer
@@ -81,6 +82,12 @@ EVENT_QUEUE_ITERATIONS = 3  # No. times we block waiting for requests for events
 EVENT_QUEUE_TIMEOUT_S = 0.1  # Timeout when waiting for requests for events

+event_fetch_ongoing_gauge = Gauge(
+    "synapse_event_fetch_ongoing",
+    "The number of event fetchers that are running",
+)
+

 @attr.s(slots=True, auto_attribs=True)
 class _EventCacheEntry:
     event: EventBase
@@ -222,6 +229,7 @@ class EventsWorkerStore(SQLBaseStore):
         self._event_fetch_lock = threading.Condition()
         self._event_fetch_list = []
         self._event_fetch_ongoing = 0
+        event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)

         # We define this sequence here so that it can be referenced from both
         # the DataStore and PersistEventStore.
@@ -732,6 +740,7 @@ class EventsWorkerStore(SQLBaseStore):
         """Takes a database connection and waits for requests for events from
         the _event_fetch_list queue.
         """
+        try:
             i = 0
             while True:
                 with self._event_fetch_lock:
@@ -745,8 +754,7 @@ class EventsWorkerStore(SQLBaseStore):
                         or single_threaded
                         or i > EVENT_QUEUE_ITERATIONS
                     ):
-                        self._event_fetch_ongoing -= 1
-                        return
+                        break
                     else:
                         self._event_fetch_lock.wait(EVENT_QUEUE_TIMEOUT_S)
                         i += 1
@@ -754,6 +762,9 @@ class EventsWorkerStore(SQLBaseStore):
                     i = 0

                 self._fetch_event_list(conn, event_list)
+        finally:
+            self._event_fetch_ongoing -= 1
+            event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)

     def _fetch_event_list(
         self, conn: Connection, event_list: List[Tuple[List[str], defer.Deferred]]
@@ -977,6 +988,7 @@ class EventsWorkerStore(SQLBaseStore):
             if self._event_fetch_ongoing < EVENT_QUEUE_THREADS:
                 self._event_fetch_ongoing += 1
+                event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
                 should_start = True
             else:
                 should_start = False