From 29c353c5536520cd149e1aacf6bd42c7c3f8f4e7 Mon Sep 17 00:00:00 2001
From: Erik Johnston <erik@matrix.org>
Date: Mon, 18 Jan 2016 16:48:17 +0000
Subject: [PATCH] Don't split at word boundaries, actually use regex

---
 synapse/push/bulk_push_rule_evaluator.py |   2 +-
 synapse/push/push_rule_evaluator.py      | 105 ++++++++++-------------
 2 files changed, 46 insertions(+), 61 deletions(-)

diff --git a/synapse/push/bulk_push_rule_evaluator.py b/synapse/push/bulk_push_rule_evaluator.py
index f1910f7da..b0b3a38db 100644
--- a/synapse/push/bulk_push_rule_evaluator.py
+++ b/synapse/push/bulk_push_rule_evaluator.py
@@ -81,7 +81,7 @@ class BulkPushRuleEvaluator:
             users_dict.items(), [event]
         )
 
-        evaluator = PushRuleEvaluatorForEvent.create(event, len(self.users_in_room))
+        evaluator = PushRuleEvaluatorForEvent(event, len(self.users_in_room))
 
         condition_cache = {}
 
diff --git a/synapse/push/push_rule_evaluator.py b/synapse/push/push_rule_evaluator.py
index 0816b632b..78d4b564d 100644
--- a/synapse/push/push_rule_evaluator.py
+++ b/synapse/push/push_rule_evaluator.py
@@ -127,7 +127,7 @@ class PushRuleEvaluator:
         room_members = yield self.store.get_users_in_room(room_id)
         room_member_count = len(room_members)
 
-        evaluator = PushRuleEvaluatorForEvent.create(ev, room_member_count)
+        evaluator = PushRuleEvaluatorForEvent(ev, room_member_count)
 
         for r in self.rules:
             if self.enabled_map.get(r['rule_id'], None) is False:
@@ -180,33 +180,13 @@ class PushRuleEvaluator:
 
 
 class PushRuleEvaluatorForEvent(object):
-    WORD_BOUNDARY = re.compile(r'\b')
-
-    def __init__(self, event, body_parts, room_member_count):
+    def __init__(self, event, room_member_count):
         self._event = event
-
-        # This is a list of words of the content.body (if event has one). Each
-        # word has been converted to lower case.
-        self._body_parts = body_parts
-
         self._room_member_count = room_member_count
 
         # Maps strings of e.g. 'content.body' -> event["content"]["body"]
         self._value_cache = _flatten_dict(event)
 
-    @staticmethod
-    def create(event, room_member_count):
-        body = event.get("content", {}).get("body", None)
-        if body:
-            body_parts = PushRuleEvaluatorForEvent.WORD_BOUNDARY.split(body)
-            body_parts[:] = [
-                part.lower() for part in body_parts
-            ]
-        else:
-            body_parts = []
-
-        return PushRuleEvaluatorForEvent(event, body_parts, room_member_count)
-
     def matches(self, condition, user_id, display_name, profile_tag):
         if condition['kind'] == 'event_match':
             return self._event_match(condition, user_id)
@@ -239,67 +219,86 @@ class PushRuleEvaluatorForEvent(object):
 
         # XXX: optimisation: cache our pattern regexps
         if condition['key'] == 'content.body':
-            matcher = _glob_to_matcher(pattern)
+            body = self._event["content"].get("body", None)
+            if not body:
+                return False
 
-            for part in self._body_parts:
-                if matcher(part):
-                    return True
-            return False
+            return _glob_matches(pattern, body, word_boundary=True)
         else:
             haystack = self._get_value(condition['key'])
             if haystack is None:
                 return False
 
-            matcher = _glob_to_matcher(pattern)
-
-            return matcher(haystack.lower())
+            return _glob_matches(pattern, haystack)
 
     def _contains_display_name(self, display_name):
         if not display_name:
             return False
 
-        lower_display_name = display_name.lower()
-        for part in self._body_parts:
-            if part == lower_display_name:
-                return True
+        body = self._event["content"].get("body", None)
+        if not body:
+            return False
 
-        return False
+        # A display name is literal text, not a glob: escape it so that
+        # characters such as "*" or "[" in a name are never read as wildcards.
+        r = r"\b%s\b" % (re.escape(display_name),)
+        return bool(re.search(r, body, flags=re.IGNORECASE))
 
     def _get_value(self, dotted_key):
         return self._value_cache.get(dotted_key, None)
 
 
-def _glob_to_matcher(glob):
-    """Takes a glob and returns a `func(string) -> bool`, which returns if the
-    string matches the glob. Assumes given string is lower case.
+def _glob_matches(glob, value, word_boundary=False):
+    """Tests if value matches glob, ignoring case.
 
-    The matcher returned is either a simple string comparison for globs without
-    wildcards, or a regex matcher for globs with wildcards.
+    Args:
+        glob (string): Pattern to test with. If IS_GLOB finds wildcards in
+            it, `*`, `?` and `[...]` ranges are translated to a regex;
+            otherwise it is treated as literal text.
+        value (string): String to test against glob.
+        word_boundary (bool): If True, the glob may match anywhere in value
+            provided the match starts and ends on a word boundary. If False,
+            the glob must match the entire string. Defaults to False.
+
+    Returns:
+        bool
     """
-    glob = glob.lower()
+    if IS_GLOB.search(glob):
+        r = re.escape(glob)
 
-    if not IS_GLOB.search(glob):
-        return lambda value: value == glob
+        r = r.replace(r'\*', '.*?')
+        r = r.replace(r'\?', '.')
 
-    r = re.escape(glob)
+        # handle [abc], [a-z] and [!a-z] style ranges.
+        r = GLOB_REGEX.sub(
+            lambda x: (
+                '[%s%s]' % (
+                    x.group(1) and '^' or '',
+                    x.group(2).replace(r'\\\-', '-')
+                )
+            ),
+            r,
+        )
+    elif word_boundary:
+        # No wildcards: search for the literal text.
+        r = re.escape(glob)
+    else:
+        # No wildcards and no word matching: a plain case-insensitive
+        # comparison is enough.
+        return value.lower() == glob.lower()
 
-    r = r.replace(r'\*', '.*?')
-    r = r.replace(r'\?', '.')
+    if word_boundary:
+        # Must be a raw string: in a normal string "\b" is a backspace
+        # character, not the regex word-boundary assertion.
+        r = r"\b%s\b" % (r,)
+        r = re.compile(r, flags=re.IGNORECASE)
+        return bool(r.search(value))
 
-    # handle [abc], [a-z] and [!a-z] style ranges.
-    r = GLOB_REGEX.sub(
-        lambda x: (
-            '[%s%s]' % (
-                x.group(1) and '^' or '',
-                x.group(2).replace(r'\\\-', '-')
-            )
-        ),
-        r,
-    )
-
-    r = r + "$"
-    r = re.compile(r)
-    return lambda value: r.match(value)
+    # match() anchors the start and "$" the end, so without word_boundary
+    # the glob has to span the string from start to end.
+    r = r + "$"
+    r = re.compile(r, flags=re.IGNORECASE)
+    return bool(r.match(value))
 
 
 def _flatten_dict(d, prefix=[], result={}):