Revert "Move glob_to_regex and re_word_boundary to matrix-python-common (#11505)" (#11527)

This reverts commit a77c369897.
This commit is contained in:
Sean Quah 2021-12-07 13:51:11 +00:00 committed by GitHub
parent 14d593f72d
commit 088d748f2c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 124 additions and 12 deletions

View file

@ -14,8 +14,9 @@
import json
import logging
import re
import typing
from typing import Any, Callable, Dict, Generator, Optional
from typing import Any, Callable, Dict, Generator, Optional, Pattern
import attr
from frozendict import frozendict
@ -34,6 +35,9 @@ if typing.TYPE_CHECKING:
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Matches a run of one or more consecutive glob wildcards (`?` and/or `*`).
# The capturing group makes `re.split` keep the wildcard runs in its output
# rather than discarding them, which `glob_to_regex` relies on.
_WILDCARD_RUN = re.compile(r"([\?\*]+)")
def _reject_invalid_json(val: Any) -> None:
"""Do not allow Infinity, -Infinity, or NaN values in JSON."""
raise ValueError("Invalid JSON value: '%s'" % val)
@ -181,3 +185,56 @@ def log_failure(
if not consumeErrors:
return failure
return None
def glob_to_regex(glob: str, word_boundary: bool = False) -> Pattern:
    """Translate a glob into a compiled, case-insensitive regex.

    Literal text in the glob is escaped. Each run of consecutive wildcards is
    collapsed into a single bounded repetition, so that pathological globs do
    not turn into slow regexes:

    - `?**?**?` means the same as `???*`
    - `???*` becomes the regex `.{3,}`

    Args:
        glob: the glob pattern to translate
        word_boundary: When True, the result is wrapped by `re_word_boundary`
            and may match anywhere in the string. When False, the result is
            anchored to the start and the end of the string.

    Returns:
        the compiled regex pattern
    """

    def _translate(piece: str) -> str:
        # Pieces that are not a wildcard run are literal text.
        if _WILDCARD_RUN.match(piece) is None:
            return re.escape(piece)

        # Every `?` demands one character; a `*` anywhere in the run lifts
        # the upper bound.
        min_len = piece.count("?")
        template = ".{%d,}" if "*" in piece else ".{%d}"
        return template % min_len

    # The split alternates between literal text and wildcard runs.
    body = "".join(_translate(piece) for piece in _WILDCARD_RUN.split(glob))

    if not word_boundary:
        # \A pins the match to the start of the string, \Z to its end.
        pattern = r"\A" + body + r"\Z"
    else:
        pattern = re_word_boundary(body)

    return re.compile(pattern, re.IGNORECASE)
def re_word_boundary(r: str) -> str:
    """
    Wraps a regex so that it only matches as a whole word: the match
    must be preceded by the start of the string or a non-word character,
    and followed by a non-word character or the end of the string.

    This still behaves sensibly when the expression itself starts or
    ends with non-word characters, which would otherwise shift where
    the word boundaries fall.
    """
    # \b is deliberately avoided because it chokes on unicode; \W (roughly
    # "anything other than a letter, digit or underscore") seems to be okay.
    template = r"(^|\W)%s(\W|$)"
    return template % (r,)