Strip overlong OpenGraph data from url preview

... to stop people causing DoSes with malicious web pages
This commit is contained in:
Richard van der Hoff 2019-11-05 15:45:17 +00:00
parent 9ffcf0f7ba
commit e9bfe719ba
3 changed files with 54 additions and 1 deletions

View file

@ -56,6 +56,9 @@ logger = logging.getLogger(__name__)
# Bytes pattern: finds a `<meta ... charset=...>` declaration (case-insensitive)
# and captures the charset name in group 1.
_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
# Text pattern: captures the (optionally double-quoted) `charset` parameter in
# group 1. NOTE(review): presumably applied to a Content-Type header value --
# confirm against the caller.
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
# Upper bounds, as measured by len(), on an OpenGraph tag's name and value;
# tags exceeding either limit are pruned from the preview data.
OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000
class PreviewUrlResource(DirectServeResource):
isLeaf = True
@ -167,7 +170,7 @@ class PreviewUrlResource(DirectServeResource):
ts (int):
Returns:
Deferred[str]: json-encoded og data
Deferred[bytes]: json-encoded og data
"""
# check the URL cache in the DB (which will also provide us with
# historical previews, if we have any)
@ -268,6 +271,17 @@ class PreviewUrlResource(DirectServeResource):
logger.warn("Failed to find any OG data in %s", url)
og = {}
# filter out any tags with stupidly long names or values
keys_to_remove = []
for k, v in og.items():
if len(k) > OG_TAG_NAME_MAXLEN or len(v) > OG_TAG_VALUE_MAXLEN:
logger.warning(
"Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
)
keys_to_remove.append(k)
for k in keys_to_remove:
del og[k]
logger.debug("Calculated OG for %s as %s" % (url, og))
jsonog = json.dumps(og)
@ -502,6 +516,10 @@ def _calc_og(tree, media_uri):
og = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
if "content" in tag.attrib:
# if we've got more than 50 tags, the page is almost certainly abusive: give up on it
if len(og) >= 50:
logger.warning("skipping OG for page with too many og: tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]
# TODO: grab article: meta tags too, e.g.: