Keep track when we try and fail to process a pulled event (#13589)

We can follow-up this PR with:

 1. Only try to backfill from an event if we haven't tried recently -> https://github.com/matrix-org/synapse/issues/13622
 1. When we decide to backfill that event again, process it in the background so it doesn't block and make `/messages` slow when we know it will probably fail again -> https://github.com/matrix-org/synapse/issues/13623
 1. Generally track failures everywhere we try and fail to pull an event over federation -> https://github.com/matrix-org/synapse/issues/13700

Fix https://github.com/matrix-org/synapse/issues/13621

Part of https://github.com/matrix-org/synapse/issues/13356

Mentioned in [internal doc](https://docs.google.com/document/d/1lvUoVfYUiy6UaHB6Rb4HicjaJAU40-APue9Q4vzuW3c/edit#bookmark=id.qv7cj51sv9i5)
This commit is contained in:
Eric Eastwood 2022-09-14 13:57:50 -05:00 committed by GitHub
parent 666ae87729
commit 957e3d74fc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 329 additions and 9 deletions

View file

@ -1294,6 +1294,51 @@ class EventFederationWorkerStore(SignatureWorkerStore, EventsWorkerStore, SQLBas
return event_id_results
@trace
async def record_event_failed_pull_attempt(
self, room_id: str, event_id: str, cause: str
) -> None:
"""
Record when we fail to pull an event over federation.
This information allows us to be more intelligent when we decide to
retry (we don't need to fail over and over) and we can process that
event in the background so we don't block on it each time.
Args:
room_id: The room where the event failed to pull from
event_id: The event that failed to be fetched or processed
cause: The error message or reason that we failed to pull the event
"""
await self.db_pool.runInteraction(
"record_event_failed_pull_attempt",
self._record_event_failed_pull_attempt_upsert_txn,
room_id,
event_id,
cause,
db_autocommit=True, # Safe as it's a single upsert
)
def _record_event_failed_pull_attempt_upsert_txn(
self,
txn: LoggingTransaction,
room_id: str,
event_id: str,
cause: str,
) -> None:
sql = """
INSERT INTO event_failed_pull_attempts (
room_id, event_id, num_attempts, last_attempt_ts, last_cause
)
VALUES (?, ?, ?, ?, ?)
ON CONFLICT (room_id, event_id) DO UPDATE SET
num_attempts=event_failed_pull_attempts.num_attempts + 1,
last_attempt_ts=EXCLUDED.last_attempt_ts,
last_cause=EXCLUDED.last_cause;
"""
txn.execute(sql, (room_id, event_id, 1, self._clock.time_msec(), cause))
async def get_missing_events(
self,
room_id: str,

View file

@ -2435,17 +2435,31 @@ class PersistEventsStore:
"DELETE FROM event_backward_extremities"
" WHERE event_id = ? AND room_id = ?"
)
backward_extremity_tuples_to_remove = [
(ev.event_id, ev.room_id)
for ev in events
if not ev.internal_metadata.is_outlier()
# If we encountered an event with no prev_events, then we might
# as well remove it now because it won't ever have anything else
# to backfill from.
or len(ev.prev_event_ids()) == 0
]
txn.execute_batch(
query,
[
(ev.event_id, ev.room_id)
for ev in events
if not ev.internal_metadata.is_outlier()
# If we encountered an event with no prev_events, then we might
# as well remove it now because it won't ever have anything else
# to backfill from.
or len(ev.prev_event_ids()) == 0
],
backward_extremity_tuples_to_remove,
)
# Clear out the failed backfill attempts after we successfully pulled
# the event. Since we no longer need these events as backward
# extremities, it also means that they won't be backfilled from again so
# we no longer need to store the backfill attempts around it.
query = """
DELETE FROM event_failed_pull_attempts
WHERE event_id = ? and room_id = ?
"""
txn.execute_batch(
query,
backward_extremity_tuples_to_remove,
)

View file

@ -81,6 +81,8 @@ Changes in SCHEMA_VERSION = 72:
Changes in SCHEMA_VERSION = 73;
- thread_id column is added to event_push_actions, event_push_actions_staging
event_push_summary, receipts_linearized, and receipts_graph.
- Add table `event_failed_pull_attempts` to keep track when we fail to pull
events over federation.
"""

View file

@ -0,0 +1,29 @@
/* Copyright 2022 The Matrix.org Foundation C.I.C
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-- Add a table that keeps track of when we failed to pull an event over
-- federation (via /backfill, `/event`, `/get_missing_events`, etc). This allows
-- us to be more intelligent when we decide to retry (we don't need to fail over
-- and over) and we can process that event in the background so we don't block
-- on it each time.
CREATE TABLE IF NOT EXISTS event_failed_pull_attempts(
room_id TEXT NOT NULL REFERENCES rooms (room_id),
event_id TEXT NOT NULL,
num_attempts INT NOT NULL,
last_attempt_ts BIGINT NOT NULL,
last_cause TEXT NOT NULL,
PRIMARY KEY (room_id, event_id)
);