Merge pull request from GHSA-x345-32rc-8h85

* tests for push rule pattern matching * tests for acl pattern matching * factor out common `re.escape` * Factor out common re.compile * Factor out common anchoring code * add word_boundary support to `glob_to_regex` * Use `glob_to_regex` in push rule evaluator NB that this drops support for character classes. I don't think anyone ever used them. * Improve efficiency of globs with multiple wildcards The idea here is that we compress multiple `*` globs into a single `.*`. We also need to consider `?`, since `*?*` is as hard to implement efficiently as `**`. * add assertion on regex pattern * Fix mypy * Simplify glob_to_regex * Inline the glob_to_regex helper function Signed-off-by: Dan Callahan <danc@element.io> * Moar comments Signed-off-by: Dan Callahan <danc@element.io> Co-authored-by: Dan Callahan <danc@element.io>
2025-11-30 20:36:36 -05:00 · 2021-05-11 10:47:23 +01:00 · 2021-05-11 10:47:23 +01:00 · 03318a766c
commit 03318a766c
parent 4df26abf28
6 changed files with 297 additions and 69 deletions
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@ -19,6 +19,7 @@ from typing import Any, Dict, List, Optional, Pattern, Tuple, Union

 from synapse.events import EventBase
 from synapse.types import UserID
+from synapse.util import glob_to_regex, re_word_boundary
 from synapse.util.caches.lrucache import LruCache

 logger = logging.getLogger(__name__)
@ -183,7 +184,7 @@ class PushRuleEvaluatorForEvent:
        r = regex_cache.get((display_name, False, True), None)
        if not r:
            r1 = re.escape(display_name)
-            r1 = _re_word_boundary(r1)
+            r1 = re_word_boundary(r1)
            r = re.compile(r1, flags=re.IGNORECASE)
            regex_cache[(display_name, False, True)] = r

@ -212,7 +213,7 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool:
    try:
        r = regex_cache.get((glob, True, word_boundary), None)
        if not r:
-            r = _glob_to_re(glob, word_boundary)
+            r = glob_to_regex(glob, word_boundary)
            regex_cache[(glob, True, word_boundary)] = r
        return bool(r.search(value))
    except re.error:
@ -220,56 +221,6 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool:
        return False


-def _glob_to_re(glob: str, word_boundary: bool) -> Pattern:
-    """Generates regex for a given glob.
-
-    Args:
-        glob
-        word_boundary: Whether to match against word boundaries or entire string.
-    """
-    if IS_GLOB.search(glob):
-        r = re.escape(glob)
-
-        r = r.replace(r"\*", ".*?")
-        r = r.replace(r"\?", ".")
-
-        # handle [abc], [a-z] and [!a-z] style ranges.
-        r = GLOB_REGEX.sub(
-            lambda x: (
-                "[%s%s]" % (x.group(1) and "^" or "", x.group(2).replace(r"\\\-", "-"))
-            ),
-            r,
-        )
-        if word_boundary:
-            r = _re_word_boundary(r)
-
-            return re.compile(r, flags=re.IGNORECASE)
-        else:
-            r = "^" + r + "$"
-
-            return re.compile(r, flags=re.IGNORECASE)
-    elif word_boundary:
-        r = re.escape(glob)
-        r = _re_word_boundary(r)
-
-        return re.compile(r, flags=re.IGNORECASE)
-    else:
-        r = "^" + re.escape(glob) + "$"
-        return re.compile(r, flags=re.IGNORECASE)
-
-
-def _re_word_boundary(r: str) -> str:
-    """
-    Adds word boundary characters to the start and end of an
-    expression to require that the match occur as a whole word,
-    but do so respecting the fact that strings starting or ending
-    with non-word characters will change word boundaries.
-    """
-    # we can't use \b as it chokes on unicode. however \W seems to be okay
-    # as shorthand for [^0-9A-Za-z_].
-    return r"(^|\W)%s(\W|$)" % (r,)
-
-
 def _flatten_dict(
    d: Union[EventBase, dict],
    prefix: Optional[List[str]] = None,