Don't split at word boundaries, actually use regex

This commit is contained in:
Erik Johnston 2016-01-18 16:48:17 +00:00
parent d16dcf642e
commit 29c353c553
2 changed files with 46 additions and 61 deletions

View File

@ -81,7 +81,7 @@ class BulkPushRuleEvaluator:
users_dict.items(), [event] users_dict.items(), [event]
) )
evaluator = PushRuleEvaluatorForEvent.create(event, len(self.users_in_room)) evaluator = PushRuleEvaluatorForEvent(event, len(self.users_in_room))
condition_cache = {} condition_cache = {}

View File

@ -127,7 +127,7 @@ class PushRuleEvaluator:
room_members = yield self.store.get_users_in_room(room_id) room_members = yield self.store.get_users_in_room(room_id)
room_member_count = len(room_members) room_member_count = len(room_members)
evaluator = PushRuleEvaluatorForEvent.create(ev, room_member_count) evaluator = PushRuleEvaluatorForEvent(ev, room_member_count)
for r in self.rules: for r in self.rules:
if self.enabled_map.get(r['rule_id'], None) is False: if self.enabled_map.get(r['rule_id'], None) is False:
@ -180,33 +180,13 @@ class PushRuleEvaluator:
class PushRuleEvaluatorForEvent(object): class PushRuleEvaluatorForEvent(object):
def __init__(self, event, room_member_count):
    """Capture the event and room size that push rule conditions are tested against.

    Args:
        event: The event being evaluated. It is stored unchanged and is also
            handed to `_flatten_dict` to build the dotted-key lookup table.
        room_member_count (int): Number of members in the event's room. The
            callers visible in this file pass the `len()` of the room's
            user list.
    """
    self._event = event
    self._room_member_count = room_member_count

    # Maps strings of e.g. 'content.body' -> event["content"]["body"]
    #
    # `_flatten_dict` declares `result={}` as a default argument, and a
    # default dict is created once and shared by every call that omits it.
    # Pass a fresh dict explicitly so values flattened from one event can
    # never show up in another evaluator's cache.
    # NOTE(review): assumes `result` is the accumulator that
    # `_flatten_dict` fills in and returns - its body is not visible from
    # here, please confirm.
    self._value_cache = _flatten_dict(event, result={})
@staticmethod
def create(event, room_member_count):
body = event.get("content", {}).get("body", None)
if body:
body_parts = PushRuleEvaluatorForEvent.WORD_BOUNDARY.split(body)
body_parts[:] = [
part.lower() for part in body_parts
]
else:
body_parts = []
return PushRuleEvaluatorForEvent(event, body_parts, room_member_count)
def matches(self, condition, user_id, display_name, profile_tag): def matches(self, condition, user_id, display_name, profile_tag):
if condition['kind'] == 'event_match': if condition['kind'] == 'event_match':
return self._event_match(condition, user_id) return self._event_match(condition, user_id)
@ -239,67 +219,72 @@ class PushRuleEvaluatorForEvent(object):
# XXX: optimisation: cache our pattern regexps # XXX: optimisation: cache our pattern regexps
if condition['key'] == 'content.body': if condition['key'] == 'content.body':
matcher = _glob_to_matcher(pattern) body = self._event["content"].get("body", None)
if not body:
return False
for part in self._body_parts: return _glob_matches(pattern, body, word_boundary=True)
if matcher(part):
return True
return False
else: else:
haystack = self._get_value(condition['key']) haystack = self._get_value(condition['key'])
if haystack is None: if haystack is None:
return False return False
matcher = _glob_to_matcher(pattern) return _glob_matches(pattern, haystack)
return matcher(haystack.lower())
def _contains_display_name(self, display_name): def _contains_display_name(self, display_name):
if not display_name: if not display_name:
return False return False
lower_display_name = display_name.lower() body = self._event["content"].get("body", None)
for part in self._body_parts: if not body:
if part == lower_display_name: return False
return True
return False return _glob_matches(display_name, body, word_boundary=True)
def _get_value(self, dotted_key): def _get_value(self, dotted_key):
return self._value_cache.get(dotted_key, None) return self._value_cache.get(dotted_key, None)
def _glob_matches(glob, value, word_boundary=False):
    """Tests if value matches glob, ignoring case.

    Args:
        glob (string): Pattern to look for. When the module-level IS_GLOB
            pattern matches it, `*`, `?` and `[...]` ranges are translated
            into their regex equivalents; otherwise it is compared as
            literal text.
        value (string): String to test against glob.
        word_boundary (bool): If True, the glob may match anywhere inside
            value, provided the match is not directly adjacent to a word
            character. If False, the glob must match the entire string.
            Defaults to False.

    Returns:
        bool
    """
    if IS_GLOB.search(glob):
        r = re.escape(glob)

        # re.escape() has turned the wildcards into `\*` and `\?`; swap
        # those back for the regex constructs they stand for.
        r = r.replace(r'\*', '.*?')
        r = r.replace(r'\?', '.')

        # handle [abc], [a-z] and [!a-z] style ranges.
        r = GLOB_REGEX.sub(
            lambda x: (
                '[%s%s]' % (
                    '^' if x.group(1) else '',
                    x.group(2).replace(r'\\\-', '-')
                )
            ),
            r,
        )
    elif word_boundary:
        r = re.escape(glob)
    else:
        # Plain text against the whole string: no regex needed.
        return value.lower() == glob.lower()

    if word_boundary:
        # Must be a raw string: in a normal string literal "\b" is a
        # backspace character, so the pattern would never match real text.
        # Look-arounds are used instead of `\b` so that patterns starting or
        # ending with punctuation still match; `\b` would require a word
        # character directly next to that punctuation.
        r = r"(?<!\w)%s(?!\w)" % (r,)
        return re.search(r, value, flags=re.IGNORECASE) is not None

    # Whole-string match: re.match anchors the start, `$` anchors the end.
    r = r + "$"
    return re.match(r, value, flags=re.IGNORECASE) is not None
def _flatten_dict(d, prefix=[], result={}): def _flatten_dict(d, prefix=[], result={}):