From 1cbc3f197cc1b9732649ffb769b05d90c0e904d7 Mon Sep 17 00:00:00 2001 From: reivilibre Date: Mon, 20 Feb 2023 12:00:18 +0000 Subject: [PATCH] Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. (#15079) Co-authored-by: David Robertson --- changelog.d/15079.bugfix | 1 + .../storage/databases/main/user_directory.py | 24 +++++-- tests/handlers/test_user_directory.py | 7 +++ tests/storage/test_user_directory.py | 63 ++++++++++++++++++- 4 files changed, 90 insertions(+), 5 deletions(-) create mode 100644 changelog.d/15079.bugfix diff --git a/changelog.d/15079.bugfix b/changelog.d/15079.bugfix new file mode 100644 index 000000000..907892c1e --- /dev/null +++ b/changelog.d/15079.bugfix @@ -0,0 +1 @@ +Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. \ No newline at end of file diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index f6a6fd407..30af4b3b6 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -918,11 +918,19 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: We use this so that we can add prefix matching, which isn't something that is supported by default. """ - results = _parse_words(search_term) + escaped_words = [] + for word in _parse_words(search_term): + # Postgres tsvector and tsquery quoting rules: + # words potentially containing punctuation should be quoted + # and then existing quotes and backslashes should be doubled + # See: https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY - both = " & ".join("(%s:* | %s)" % (result, result) for result in results) - exact = " & ".join("%s" % (result,) for result in results) - prefix = " & ".join("%s:*" % (result,) for result in results) + quoted_word = word.replace("'", "''").replace("\\", "\\\\") + escaped_words.append(f"'{quoted_word}'") + + both = " & ".join("(%s:* | %s)" % (word, word) for word in escaped_words) + exact = " & ".join("%s" % (word,) for word in escaped_words) + prefix = " & ".join("%s:*" % (word,) for word in escaped_words) return both, exact, prefix @@ -944,6 +952,14 @@ def _parse_words(search_term: str) -> List[str]: if USE_ICU: return _parse_words_with_icu(search_term) + return _parse_words_with_regex(search_term) + + +def _parse_words_with_regex(search_term: str) -> List[str]: + """ + Break down search term into words, when we don't have ICU available. + See: `_parse_words` + """ return re.findall(r"([\w\-]+)", search_term, re.UNICODE) diff --git a/tests/handlers/test_user_directory.py b/tests/handlers/test_user_directory.py index f65a68b9c..a02c1c622 100644 --- a/tests/handlers/test_user_directory.py +++ b/tests/handlers/test_user_directory.py @@ -192,6 +192,13 @@ class UserDirectoryTestCase(unittest.HomeserverTestCase): self.helper.join(room, self.appservice.sender, tok=self.appservice.token) self._check_only_one_user_in_directory(user, room) + def test_search_term_with_colon_in_it_does_not_raise(self) -> None: + """ + Regression test: Test that search terms with colons in them are acceptable. + """ + u1 = self.register_user("user1", "pass") + self.get_success(self.handler.search_users(u1, "haha:paamayim-nekudotayim", 10)) + def test_user_not_in_users_table(self) -> None: """Unclear how it happens, but on matrix.org we've seen join events for users who aren't in the users table. Test that we don't fall over diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index f1ca523d2..2d169684c 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -25,6 +25,11 @@ from synapse.rest.client import login, register, room from synapse.server import HomeServer from synapse.storage import DataStore from synapse.storage.background_updates import _BackgroundUpdateHandler +from synapse.storage.databases.main import user_directory +from synapse.storage.databases.main.user_directory import ( + _parse_words_with_icu, + _parse_words_with_regex, +) from synapse.storage.roommember import ProfileInfo from synapse.util import Clock @@ -42,7 +47,7 @@ ALICE = "@alice:a" BOB = "@bob:b" BOBBY = "@bobby:a" # The localpart isn't 'Bela' on purpose so we can test looking up display names. -BELA = "@somenickname:a" +BELA = "@somenickname:example.org" class GetUserDirectoryTables: @@ -423,6 +428,8 @@ class UserDirectoryInitialPopulationTestcase(HomeserverTestCase): class UserDirectoryStoreTestCase(HomeserverTestCase): + use_icu = False + def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: self.store = hs.get_datastores().main @@ -434,6 +441,12 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None)) self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB))) + self._restore_use_icu = user_directory.USE_ICU + user_directory.USE_ICU = self.use_icu + + def tearDown(self) -> None: + user_directory.USE_ICU = self._restore_use_icu + def test_search_user_dir(self) -> None: # normally when alice searches the directory she should just find # bob because bobby doesn't share a room with her. @@ -478,6 +491,26 @@ class UserDirectoryStoreTestCase(HomeserverTestCase): {"user_id": BELA, "display_name": "Bela", "avatar_url": None}, ) + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_start_of_user_id(self) -> None: + """Tests that a user can look up another user by searching for the start + of their user ID. + """ + r = self.get_success(self.store.search_user_dir(ALICE, "somenickname:exa", 10)) + self.assertFalse(r["limited"]) + self.assertEqual(1, len(r["results"])) + self.assertDictEqual( + r["results"][0], + {"user_id": BELA, "display_name": "Bela", "avatar_url": None}, + ) + + +class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): + use_icu = True + + if not icu: + skip = "Requires PyICU" + class UserDirectoryICUTestCase(HomeserverTestCase): if not icu: @@ -513,3 +546,31 @@ class UserDirectoryICUTestCase(HomeserverTestCase): r["results"][0], {"user_id": ALICE, "display_name": display_name, "avatar_url": None}, ) + + def test_icu_word_boundary_punctuation(self) -> None: + """ + Tests the behaviour of punctuation with the ICU tokeniser. + + Seems to depend on underlying version of ICU. + """ + + # Note: either tokenisation is fine, because Postgres actually splits + # words itself afterwards. + self.assertIn( + _parse_words_with_icu("lazy'fox jumped:over the.dog"), + ( + # ICU 66 on Ubuntu 20.04 + ["lazy'fox", "jumped", "over", "the", "dog"], + # ICU 70 on Ubuntu 22.04 + ["lazy'fox", "jumped:over", "the.dog"], + ), + ) + + def test_regex_word_boundary_punctuation(self) -> None: + """ + Tests the behaviour of punctuation with the non-ICU tokeniser + """ + self.assertEqual( + _parse_words_with_regex("lazy'fox jumped:over the.dog"), + ["lazy", "fox", "jumped", "over", "the", "dog"], + )