From b38bdae3a2e5b7cfe862580368b996b8d7dfa50f Mon Sep 17 00:00:00 2001
From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com>
Date: Tue, 4 Jan 2022 16:36:33 +0000
Subject: [PATCH] Fix AssertionErrors after purging events (#11642)

* Fix AssertionErrors after purging events

If you purged a bunch of events from your database, and then restarted synapse
without receiving more events, then you would get a bunch of AssertionErrors on
restart.

This fixes the situation by rewinding the stream processors.

* `check-newsfragment`: ignore deleted newsfiles
---
 changelog.d/11536.bugfix           |  1 +
 changelog.d/11536.misc             |  1 -
 changelog.d/11642.bugfix           |  1 +
 scripts-dev/check-newsfragment     |  4 ++--
 synapse/handlers/stats.py          | 11 +++++++++++
 synapse/handlers/user_directory.py | 18 +++++++++++++++---
 6 files changed, 30 insertions(+), 6 deletions(-)
 create mode 100644 changelog.d/11536.bugfix
 delete mode 100644 changelog.d/11536.misc
 create mode 100644 changelog.d/11642.bugfix

diff --git a/changelog.d/11536.bugfix b/changelog.d/11536.bugfix
new file mode 100644
index 000000000..4a1b00725
--- /dev/null
+++ b/changelog.d/11536.bugfix
@@ -0,0 +1 @@
+Fix a long-standing bug which could cause `AssertionError`s to be written to the log when Synapse was restarted after purging events from the database.
diff --git a/changelog.d/11536.misc b/changelog.d/11536.misc
deleted file mode 100644
index b9191c111..000000000
--- a/changelog.d/11536.misc
+++ /dev/null
@@ -1 +0,0 @@
-Improvements to log messages around handling stream ids.
diff --git a/changelog.d/11642.bugfix b/changelog.d/11642.bugfix
new file mode 100644
index 000000000..4a1b00725
--- /dev/null
+++ b/changelog.d/11642.bugfix
@@ -0,0 +1 @@
+Fix a long-standing bug which could cause `AssertionError`s to be written to the log when Synapse was restarted after purging events from the database.
diff --git a/scripts-dev/check-newsfragment b/scripts-dev/check-newsfragment
index af4de345d..c764011d6 100755
--- a/scripts-dev/check-newsfragment
+++ b/scripts-dev/check-newsfragment
@@ -42,8 +42,8 @@ echo "--------------------------"
 echo
 
 matched=0
-for f in $(git diff --name-only FETCH_HEAD... -- changelog.d); do
-    # check that any modified newsfiles on this branch end with a full stop.
+for f in $(git diff --diff-filter=d --name-only FETCH_HEAD... -- changelog.d); do
+    # check that any added newsfiles on this branch end with a full stop.
     lastchar=$(tr -d '\n' < "$f" | tail -c 1)
     if [ "$lastchar" != '.' ] && [ "$lastchar" != '!' ]; then
         echo -e "\e[31mERROR: newsfragment $f does not end with a '.' or '!'\e[39m" >&2
diff --git a/synapse/handlers/stats.py b/synapse/handlers/stats.py
index bd3e6f2ec..29e41a4c7 100644
--- a/synapse/handlers/stats.py
+++ b/synapse/handlers/stats.py
@@ -80,6 +80,17 @@ class StatsHandler:
         # If self.pos is None then means we haven't fetched it from DB
         if self.pos is None:
             self.pos = await self.store.get_stats_positions()
+            room_max_stream_ordering = self.store.get_room_max_stream_ordering()
+            if self.pos > room_max_stream_ordering:
+                # apparently, we've processed more events than exist in the database!
+                # this can happen if events are removed with history purge or similar.
+                logger.warning(
+                    "Event stream ordering appears to have gone backwards (%i -> %i): "
+                    "rewinding stats processor",
+                    self.pos,
+                    room_max_stream_ordering,
+                )
+                self.pos = room_max_stream_ordering
 
         # Loop round handling deltas until we're up to date
 
diff --git a/synapse/handlers/user_directory.py b/synapse/handlers/user_directory.py
index a0eb45446..1565e034c 100644
--- a/synapse/handlers/user_directory.py
+++ b/synapse/handlers/user_directory.py
@@ -148,9 +148,21 @@ class UserDirectoryHandler(StateDeltasHandler):
         if self.pos is None:
             self.pos = await self.store.get_user_directory_stream_pos()
 
-        # If still None then the initial background update hasn't happened yet.
-        if self.pos is None:
-            return None
+            # If still None then the initial background update hasn't happened yet.
+            if self.pos is None:
+                return None
+
+            room_max_stream_ordering = self.store.get_room_max_stream_ordering()
+            if self.pos > room_max_stream_ordering:
+                # apparently, we've processed more events than exist in the database!
+                # this can happen if events are removed with history purge or similar.
+                logger.warning(
+                    "Event stream ordering appears to have gone backwards (%i -> %i): "
+                    "rewinding user directory processor",
+                    self.pos,
+                    room_max_stream_ordering,
+                )
+                self.pos = room_max_stream_ordering
 
         # Loop round handling deltas until we're up to date
         while True: