From 6748f0a57962fb9657cab60083d94b4c97a0526c Mon Sep 17 00:00:00 2001 From: David Baker Date: Thu, 5 Oct 2017 11:33:30 +0100 Subject: [PATCH 1/3] Fix notif kws that start/end with non-word chars Only prepend / append word boundary characters if the search expression starts or ends with a word character, otherwise they don't work because there's no word boundary between whitespace and a non-word char. --- synapse/push/push_rule_evaluator.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py index 172c27c13..5a34d60ab 100644 --- a/synapse/push/push_rule_evaluator.py +++ b/synapse/push/push_rule_evaluator.py @@ -26,6 +26,8 @@ logger = logging.getLogger(__name__) GLOB_REGEX = re.compile(r'\\\[(\\\!|)(.*)\\\]') IS_GLOB = re.compile(r'[\?\*\[\]]') INEQUALITY_EXPR = re.compile("^([=<>]*)([0-9]*)$") +STARTS_WITH_WORD_CHAR_REGEX = re.compile(r"^\w") +ENDS_WITH_WORD_CHAR_REGEX = re.compile(r"\w$") def _room_member_count(ev, condition, room_member_count): @@ -183,7 +185,7 @@ def _glob_to_re(glob, word_boundary): r, ) if word_boundary: - r = r"\b%s\b" % (r,) + r = _re_word_boundary(r) return re.compile(r, flags=re.IGNORECASE) else: @@ -192,13 +194,30 @@ def _glob_to_re(glob, word_boundary): return re.compile(r, flags=re.IGNORECASE) elif word_boundary: r = re.escape(glob) - r = r"\b%s\b" % (r,) + r = _re_word_boundary(r) return re.compile(r, flags=re.IGNORECASE) else: r = "^" + re.escape(glob) + "$" return re.compile(r, flags=re.IGNORECASE) +def _re_word_boundary(r): + """ + Adds word boundary characters to the start and end of an + expression to require that the match occur as a whole word, + but do so respecting the fact that strings starting or ending + with non-word characters will change word boundaries. 
+ """ + # Matching a regex string aginst a regex, since by definition + # \b is the boundary between a \w and a \W, so match \w at the + # start or end of the expression (although this will miss, eg. + # "[dl]og") + if STARTS_WITH_WORD_CHAR_REGEX.search(r): + r = r"\b%s" % (r,) + if ENDS_WITH_WORD_CHAR_REGEX.search(r): + r = r"%s\b" % (r,) + return r + def _flatten_dict(d, prefix=[], result=None): if result is None: From cbe3c3fdd49b87a452a9a9a229abfdf8dbe45922 Mon Sep 17 00:00:00 2001 From: David Baker Date: Thu, 5 Oct 2017 11:43:10 +0100 Subject: [PATCH 2/3] pep8 --- synapse/push/push_rule_evaluator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py index 5a34d60ab..b78f2d90d 100644 --- a/synapse/push/push_rule_evaluator.py +++ b/synapse/push/push_rule_evaluator.py @@ -201,6 +201,7 @@ def _glob_to_re(glob, word_boundary): r = "^" + re.escape(glob) + "$" return re.compile(r, flags=re.IGNORECASE) + def _re_word_boundary(r): """ Adds word boundary characters to the start and end of an From 0c8da8b519fbd8bca984117e354fe57c3a76e154 Mon Sep 17 00:00:00 2001 From: David Baker Date: Thu, 5 Oct 2017 11:57:43 +0100 Subject: [PATCH 3/3] Use better method for word boundary searching From https://github.com/matrix-org/matrix-js-sdk/commit/ebc95667b8a5777d13e5d3c679972bedae022fd5 --- synapse/push/push_rule_evaluator.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py index b78f2d90d..65f9a63fd 100644 --- a/synapse/push/push_rule_evaluator.py +++ b/synapse/push/push_rule_evaluator.py @@ -26,8 +26,6 @@ logger = logging.getLogger(__name__) GLOB_REGEX = re.compile(r'\\\[(\\\!|)(.*)\\\]') IS_GLOB = re.compile(r'[\?\*\[\]]') INEQUALITY_EXPR = re.compile("^([=<>]*)([0-9]*)$") -STARTS_WITH_WORD_CHAR_REGEX = re.compile(r"^\w") -ENDS_WITH_WORD_CHAR_REGEX = re.compile(r"\w$") def _room_member_count(ev, 
condition, room_member_count): @@ -209,15 +207,9 @@ def _re_word_boundary(r): but do so respecting the fact that strings starting or ending with non-word characters will change word boundaries. """ - # Matching a regex string aginst a regex, since by definition - # \b is the boundary between a \w and a \W, so match \w at the - # start or end of the expression (although this will miss, eg. - # "[dl]og") - if STARTS_WITH_WORD_CHAR_REGEX.search(r): - r = r"\b%s" % (r,) - if ENDS_WITH_WORD_CHAR_REGEX.search(r): - r = r"%s\b" % (r,) - return r + # we can't use \b as it chokes on unicode. however \W seems to be okay + # as shorthand for [^0-9A-Za-z_]. + return r"(^|\W)%s(\W|$)" % (r,) def _flatten_dict(d, prefix=[], result=None):