Add optional ICU support for user search (#14464)

Fixes #13655

This change uses ICU (International Components for Unicode) to improve word boundary detection in user search.

This change also adds a new dependency on libicu-dev and pkg-config for the Debian packages, which are available in all supported distros.
This commit is contained in:
Brendan Abolivier 2022-12-12 13:21:17 +01:00 committed by GitHub
parent a5d8fee097
commit 2a3cd59dd0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 166 additions and 6 deletions

View file

@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from typing import Any, Dict, Set, Tuple
from unittest import mock
from unittest.mock import Mock, patch
@ -30,6 +31,12 @@ from synapse.util import Clock
from tests.test_utils.event_injection import inject_member_event
from tests.unittest import HomeserverTestCase, override_config
try:
import icu
except ImportError:
icu = None # type: ignore
ALICE = "@alice:a"
BOB = "@bob:b"
BOBBY = "@bobby:a"
@ -467,3 +474,39 @@ class UserDirectoryStoreTestCase(HomeserverTestCase):
r["results"][0],
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
)
class UserDirectoryICUTestCase(HomeserverTestCase):
    """Exercises user directory search with ICU-based word breaking.

    When the optional PyICU bindings could not be imported (``icu`` is falsy),
    the class sets a ``skip`` attribute with the reason; presumably the test
    runner honours it and skips the whole case -- confirm against the base
    class.
    """

    if not icu:
        skip = "Requires PyICU"

    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
        """Keep a handle on the main datastore and a user directory table helper.

        Args:
            reactor: Unused here.
            clock: Unused here.
            hs: The homeserver under test, from which the main datastore is taken.
        """
        self.store = hs.get_datastores().main
        self.user_dir_helper = GetUserDirectoryTables(self.store)

    def test_icu_word_boundary(self) -> None:
        """Tests that we correctly detect word boundaries when ICU (International
        Components for Unicode) support is available.
        """
        # The accent is deliberately written as an explicit escape: the name is
        # "G", "a", U+0301 (combining acute accent), "o". Spelling it out keeps
        # the test meaningful even if an editor or tool NFC-normalises the file,
        # which would silently turn the pair into the single precomposed
        # character and make the precondition below fail.
        display_name = "Ga\u0301o"

        # This word is not broken down correctly by Python's regular expressions:
        # the pattern below yields two matches instead of one, because the
        # combining accent is not matched by `\w`. This is specifically
        # something that ICU support fixes.
        matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE)
        self.assertEqual(len(matches), 2)

        self.get_success(
            self.store.update_profile_in_user_dir(ALICE, display_name, None)
        )
        self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,)))

        # Check that searching for this user yields the correct result.
        r = self.get_success(self.store.search_user_dir(BOB, display_name, 10))
        self.assertFalse(r["limited"])
        self.assertEqual(len(r["results"]), 1)
        self.assertDictEqual(
            r["results"][0],
            {"user_id": ALICE, "display_name": display_name, "avatar_url": None},
        )