From 0c8da8b519fbd8bca984117e354fe57c3a76e154 Mon Sep 17 00:00:00 2001
From: David Baker <dave@matrix.org>
Date: Thu, 5 Oct 2017 11:57:43 +0100
Subject: [PATCH] Use better method for word boundary searching

Based on matrix-js-sdk commit https://github.com/matrix-org/matrix-js-sdk/commit/ebc95667b8a5777d13e5d3c679972bedae022fd5
---
 synapse/push/push_rule_evaluator.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py
index b78f2d90d..65f9a63fd 100644
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -26,8 +26,6 @@ logger = logging.getLogger(__name__)
 GLOB_REGEX = re.compile(r'\\\[(\\\!|)(.*)\\\]')
 IS_GLOB = re.compile(r'[\?\*\[\]]')
 INEQUALITY_EXPR = re.compile("^([=<>]*)([0-9]*)$")
-STARTS_WITH_WORD_CHAR_REGEX = re.compile(r"^\w")
-ENDS_WITH_WORD_CHAR_REGEX = re.compile(r"\w$")
 
 
 def _room_member_count(ev, condition, room_member_count):
@@ -209,15 +207,9 @@ def _re_word_boundary(r):
     but do so respecting the fact that strings starting or ending
     with non-word characters will change word boundaries.
     """
-    # Matching a regex string aginst a regex, since by definition
-    # \b is the boundary between a \w and a \W, so match \w at the
-    # start or end of the expression (although this will miss, eg.
-    # "[dl]og")
-    if STARTS_WITH_WORD_CHAR_REGEX.search(r):
-        r = r"\b%s" % (r,)
-    if ENDS_WITH_WORD_CHAR_REGEX.search(r):
-        r = r"%s\b" % (r,)
-    return r
+    # We can't use \b because it chokes on unicode; however, \W seems to be
+    # okay as shorthand for [^0-9A-Za-z_].
+    return r"(^|\W)%s(\W|$)" % (r,)
 
 
 def _flatten_dict(d, prefix=[], result=None):