mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2025-08-07 11:32:13 -04:00
Add optional ICU support for user search (#14464)
Fixes #13655. This change uses ICU (International Components for Unicode) to improve word boundary detection in user search. It also adds new build dependencies on libicu-dev and pkg-config for the Debian packages; both are available in all supported distros.
This commit is contained in:
parent
a5d8fee097
commit
2a3cd59dd0
10 changed files with 166 additions and 6 deletions
|
@ -11,6 +11,7 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import re
|
||||
from typing import Any, Dict, Set, Tuple
|
||||
from unittest import mock
|
||||
from unittest.mock import Mock, patch
|
||||
|
@ -30,6 +31,12 @@ from synapse.util import Clock
|
|||
from tests.test_utils.event_injection import inject_member_event
|
||||
from tests.unittest import HomeserverTestCase, override_config
|
||||
|
||||
try:
|
||||
import icu
|
||||
except ImportError:
|
||||
icu = None # type: ignore
|
||||
|
||||
|
||||
ALICE = "@alice:a"
|
||||
BOB = "@bob:b"
|
||||
BOBBY = "@bobby:a"
|
||||
|
@ -467,3 +474,39 @@ class UserDirectoryStoreTestCase(HomeserverTestCase):
|
|||
r["results"][0],
|
||||
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
|
||||
)
|
||||
|
||||
|
||||
class UserDirectoryICUTestCase(HomeserverTestCase):
    """Exercises user directory search with ICU-based word boundary detection.

    The case is marked to be skipped when the optional PyICU module could not
    be imported.
    """

    # `icu` is None when the optional import at the top of this module failed.
    if not icu:
        skip = "Requires PyICU"

    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
        """Keep handles on the main datastore and the user directory helper."""
        self.store = hs.get_datastores().main
        self.user_dir_helper = GetUserDirectoryTables(self.store)

    def test_icu_word_boundary(self) -> None:
        """Tests that we correctly detect word boundaries when ICU (International
        Components for Unicode) support is available.
        """

        # The accented letter is deliberately spelled in decomposed form: a
        # lowercase "a" followed by U+0301 COMBINING ACUTE ACCENT. It is written
        # as an explicit escape so that editors or tooling applying Unicode
        # (NFC) normalization cannot silently turn it into the precomposed
        # U+00E1, which would defeat the purpose of this test.
        display_name = "Ga\u0301o"

        # This word is not broken down correctly by Python's regular expressions:
        # the combining accent is not matched by \w, so the name is split in two
        # around it. This is specifically something that ICU support fixes.
        matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE)
        self.assertEqual(len(matches), 2)

        self.get_success(
            self.store.update_profile_in_user_dir(ALICE, display_name, None)
        )
        self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,)))

        # Check that searching for this user yields the correct result.
        r = self.get_success(self.store.search_user_dir(BOB, display_name, 10))
        self.assertFalse(r["limited"])
        self.assertEqual(len(r["results"]), 1)
        self.assertDictEqual(
            r["results"][0],
            {"user_id": ALICE, "display_name": display_name, "avatar_url": None},
        )
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue