Merge pull request #2500 from matrix-org/dbkr/fix_word_boundary_mentions

Fix notif kws that start/end with non-word chars
2024-10-01 01:36:05 -04:00 · 2017-10-05 12:27:59 +01:00 · 2017-10-05 12:27:59 +01:00 · 44f8e383f3
commit 44f8e383f3
parent 93b0cf7a99 0c8da8b519
1 changed files with 14 additions and 2 deletions
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -183,7 +183,7 @@ def _glob_to_re(glob, word_boundary):
            r,
        )
        if word_boundary:
-            r = r"\b%s\b" % (r,)
+            r = _re_word_boundary(r)

            return re.compile(r, flags=re.IGNORECASE)
        else:
@@ -192,7 +192,7 @@ def _glob_to_re(glob, word_boundary):
            return re.compile(r, flags=re.IGNORECASE)
    elif word_boundary:
        r = re.escape(glob)
-        r = r"\b%s\b" % (r,)
+        r = _re_word_boundary(r)

        return re.compile(r, flags=re.IGNORECASE)
    else:
@@ -200,6 +200,18 @@ def _glob_to_re(glob, word_boundary):
        return re.compile(r, flags=re.IGNORECASE)


+def _re_word_boundary(r):
+    """
+    Wraps an expression in word-boundary patterns at its start and
+    end so that the match must occur as a whole word, while
+    respecting the fact that strings starting or ending with
+    non-word characters change where the word boundaries fall.
+    """
+    # We can't use \b as it chokes on Unicode. However, \W seems to be okay
+    # as shorthand for [^0-9A-Za-z_].
+    return r"(^|\W)%s(\W|$)" % (r,)
+
+
 def _flatten_dict(d, prefix=[], result=None):
    if result is None:
        result = {}