mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2025-07-27 02:45:16 -04:00
Add optional ICU support for user search (#14464)
Fixes #13655 This change uses ICU (International Components for Unicode) to improve boundary detection in user search. This change also adds a new dependency on libicu-dev and pkg-config for the Debian packages, which are available in all supported distros.
This commit is contained in:
parent
a5d8fee097
commit
2a3cd59dd0
10 changed files with 166 additions and 6 deletions
|
@ -26,6 +26,14 @@ from typing import (
|
|||
cast,
|
||||
)
|
||||
|
||||
try:
|
||||
# Figure out if ICU support is available for searching users.
|
||||
import icu
|
||||
|
||||
USE_ICU = True
|
||||
except ModuleNotFoundError:
|
||||
USE_ICU = False
|
||||
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from synapse.api.errors import StoreError
|
||||
|
@ -900,7 +908,7 @@ def _parse_query_sqlite(search_term: str) -> str:
|
|||
"""
|
||||
|
||||
# Pull out the individual words, discarding any non-word characters.
|
||||
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
|
||||
results = _parse_words(search_term)
|
||||
return " & ".join("(%s* OR %s)" % (result, result) for result in results)
|
||||
|
||||
|
||||
|
@ -910,12 +918,63 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
|
|||
We use this so that we can add prefix matching, which isn't something
|
||||
that is supported by default.
|
||||
"""
|
||||
|
||||
# Pull out the individual words, discarding any non-word characters.
|
||||
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
|
||||
results = _parse_words(search_term)
|
||||
|
||||
both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
|
||||
exact = " & ".join("%s" % (result,) for result in results)
|
||||
prefix = " & ".join("%s:*" % (result,) for result in results)
|
||||
|
||||
return both, exact, prefix
|
||||
|
||||
|
||||
def _parse_words(search_term: str) -> List[str]:
|
||||
"""Split the provided search string into a list of its words.
|
||||
|
||||
If support for ICU (International Components for Unicode) is available, use it.
|
||||
Otherwise, fall back to using a regex to detect word boundaries. This latter
|
||||
solution works well enough for most latin-based languages, but doesn't work as well
|
||||
with other languages.
|
||||
|
||||
Args:
|
||||
search_term: The search string.
|
||||
|
||||
Returns:
|
||||
A list of the words in the search string.
|
||||
"""
|
||||
if USE_ICU:
|
||||
return _parse_words_with_icu(search_term)
|
||||
|
||||
return re.findall(r"([\w\-]+)", search_term, re.UNICODE)
|
||||
|
||||
|
||||
def _parse_words_with_icu(search_term: str) -> List[str]:
|
||||
"""Break down the provided search string into its individual words using ICU
|
||||
(International Components for Unicode).
|
||||
|
||||
Args:
|
||||
search_term: The search string.
|
||||
|
||||
Returns:
|
||||
A list of the words in the search string.
|
||||
"""
|
||||
results = []
|
||||
breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault())
|
||||
breaker.setText(search_term)
|
||||
i = 0
|
||||
while True:
|
||||
j = breaker.nextBoundary()
|
||||
if j < 0:
|
||||
break
|
||||
|
||||
result = search_term[i:j]
|
||||
|
||||
# libicu considers spaces and punctuation between words as words, but we don't
|
||||
# want to include those in results as they would result in syntax errors in SQL
|
||||
# queries (e.g. "foo bar" would result in the search query including "foo & &
|
||||
# bar").
|
||||
if len(re.findall(r"([\w\-]+)", result, re.UNICODE)):
|
||||
results.append(result)
|
||||
|
||||
i = j
|
||||
|
||||
return results
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue