mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2025-08-09 08:42:11 -04:00
Improve handling of non-ASCII characters in user directory search (#15143)
* Fix a long-standing bug where non-ASCII characters in search terms, including accented letters, would not match characters in a different case. * Fix a long-standing bug where search terms using combining accents would not match display names using precomposed accents and vice versa. To fully take effect, the user directory must be rebuilt after this change. Fixes #14630. Signed-off-by: Sean Quah <seanq@matrix.org>
This commit is contained in:
parent
682151a464
commit
335f52d595
3 changed files with 184 additions and 2 deletions
|
@ -504,6 +504,139 @@ class UserDirectoryStoreTestCase(HomeserverTestCase):
|
|||
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
|
||||
)
|
||||
|
||||
@override_config({"user_directory": {"search_all_users": True}})
|
||||
def test_search_user_dir_ascii_case_insensitivity(self) -> None:
|
||||
"""Tests that a user can look up another user by searching for their name in a
|
||||
different case.
|
||||
"""
|
||||
CHARLIE = "@someuser:example.org"
|
||||
self.get_success(
|
||||
self.store.update_profile_in_user_dir(CHARLIE, "Charlie", None)
|
||||
)
|
||||
|
||||
r = self.get_success(self.store.search_user_dir(ALICE, "cHARLIE", 10))
|
||||
self.assertFalse(r["limited"])
|
||||
self.assertEqual(1, len(r["results"]))
|
||||
self.assertDictEqual(
|
||||
r["results"][0],
|
||||
{"user_id": CHARLIE, "display_name": "Charlie", "avatar_url": None},
|
||||
)
|
||||
|
||||
@override_config({"user_directory": {"search_all_users": True}})
|
||||
def test_search_user_dir_unicode_case_insensitivity(self) -> None:
|
||||
"""Tests that a user can look up another user by searching for their name in a
|
||||
different case.
|
||||
"""
|
||||
IVAN = "@someuser:example.org"
|
||||
self.get_success(self.store.update_profile_in_user_dir(IVAN, "Иван", None))
|
||||
|
||||
r = self.get_success(self.store.search_user_dir(ALICE, "иВАН", 10))
|
||||
self.assertFalse(r["limited"])
|
||||
self.assertEqual(1, len(r["results"]))
|
||||
self.assertDictEqual(
|
||||
r["results"][0],
|
||||
{"user_id": IVAN, "display_name": "Иван", "avatar_url": None},
|
||||
)
|
||||
|
||||
@override_config({"user_directory": {"search_all_users": True}})
|
||||
def test_search_user_dir_dotted_dotless_i_case_insensitivity(self) -> None:
|
||||
"""Tests that a user can look up another user by searching for their name in a
|
||||
different case, when their name contains dotted or dotless "i"s.
|
||||
|
||||
Some languages have dotted and dotless versions of "i", which are considered to
|
||||
be different letters: i <-> İ, ı <-> I. To make things difficult, they reuse the
|
||||
ASCII "i" and "I" code points, despite having different lowercase / uppercase
|
||||
forms.
|
||||
"""
|
||||
USER = "@someuser:example.org"
|
||||
|
||||
expected_matches = [
|
||||
# (search_term, display_name)
|
||||
# A search for "i" should match "İ".
|
||||
("iiiii", "İİİİİ"),
|
||||
# A search for "I" should match "ı".
|
||||
("IIIII", "ııııı"),
|
||||
# A search for "ı" should match "I".
|
||||
("ııııı", "IIIII"),
|
||||
# A search for "İ" should match "i".
|
||||
("İİİİİ", "iiiii"),
|
||||
]
|
||||
|
||||
for search_term, display_name in expected_matches:
|
||||
self.get_success(
|
||||
self.store.update_profile_in_user_dir(USER, display_name, None)
|
||||
)
|
||||
|
||||
r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10))
|
||||
self.assertFalse(r["limited"])
|
||||
self.assertEqual(
|
||||
1,
|
||||
len(r["results"]),
|
||||
f"searching for {search_term!r} did not match {display_name!r}",
|
||||
)
|
||||
self.assertDictEqual(
|
||||
r["results"][0],
|
||||
{"user_id": USER, "display_name": display_name, "avatar_url": None},
|
||||
)
|
||||
|
||||
# We don't test for negative matches, to allow implementations that consider all
|
||||
# the i variants to be the same.
|
||||
|
||||
test_search_user_dir_dotted_dotless_i_case_insensitivity.skip = "not supported" # type: ignore
|
||||
|
||||
@override_config({"user_directory": {"search_all_users": True}})
|
||||
def test_search_user_dir_unicode_normalization(self) -> None:
|
||||
"""Tests that a user can look up another user by searching for their name with
|
||||
either composed or decomposed accents.
|
||||
"""
|
||||
AMELIE = "@someuser:example.org"
|
||||
|
||||
expected_matches = [
|
||||
# (search_term, display_name)
|
||||
("Ame\u0301lie", "Amélie"),
|
||||
("Amélie", "Ame\u0301lie"),
|
||||
]
|
||||
|
||||
for search_term, display_name in expected_matches:
|
||||
self.get_success(
|
||||
self.store.update_profile_in_user_dir(AMELIE, display_name, None)
|
||||
)
|
||||
|
||||
r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10))
|
||||
self.assertFalse(r["limited"])
|
||||
self.assertEqual(
|
||||
1,
|
||||
len(r["results"]),
|
||||
f"searching for {search_term!r} did not match {display_name!r}",
|
||||
)
|
||||
self.assertDictEqual(
|
||||
r["results"][0],
|
||||
{"user_id": AMELIE, "display_name": display_name, "avatar_url": None},
|
||||
)
|
||||
|
||||
@override_config({"user_directory": {"search_all_users": True}})
|
||||
def test_search_user_dir_accent_insensitivity(self) -> None:
|
||||
"""Tests that a user can look up another user by searching for their name
|
||||
without any accents.
|
||||
"""
|
||||
AMELIE = "@someuser:example.org"
|
||||
self.get_success(self.store.update_profile_in_user_dir(AMELIE, "Amélie", None))
|
||||
|
||||
r = self.get_success(self.store.search_user_dir(ALICE, "amelie", 10))
|
||||
self.assertFalse(r["limited"])
|
||||
self.assertEqual(1, len(r["results"]))
|
||||
self.assertDictEqual(
|
||||
r["results"][0],
|
||||
{"user_id": AMELIE, "display_name": "Amélie", "avatar_url": None},
|
||||
)
|
||||
|
||||
# It may be desirable for "é"s in search terms to not match plain "e"s and we
|
||||
# really don't want "é"s in search terms to match "e"s with different accents.
|
||||
# But we don't test for this to allow implementations that consider all
|
||||
# "e"-lookalikes to be the same.
|
||||
|
||||
test_search_user_dir_accent_insensitivity.skip = "not supported yet" # type: ignore
|
||||
|
||||
|
||||
class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase):
|
||||
use_icu = True
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue