explicitly pass in the charset from Content-Type to lxml to fix cyrillic woes better

This commit is contained in:
Matthew Hodgson 2016-04-15 14:32:25 +01:00
parent 84f9cac4d0
commit aaabbd3e9e

View File

@ -179,15 +179,27 @@ class PreviewUrlResource(BaseMediaResource):
elif self._is_html(media_info['media_type']):
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
from lxml import html
from lxml import etree
# XXX: always manually try to decode body as utf-8 first, which
# seems to help with most character encoding woes.
# XXX: handle non-utf-8 encodings?
file = open(media_info['filename'])
body = file.read()
file.close()
tree = html.fromstring(body.decode('utf-8', 'ignore'))
# clobber the encoding from the content-type, or default to utf-8
# XXX: this overrides any <meta/> or XML charset headers in the body
# which may pose problems, but so far seems to work okay.
match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
encoding = match.group(1) if match else "utf-8"
try:
parser = etree.HTMLParser(recover=True, encoding=encoding)
tree = etree.fromstring(body, parser)
og = yield self._calc_og(tree, media_info, requester)
except UnicodeDecodeError:
# blindly try decoding the body as utf-8, which seems to fix
# the charset mismatches on https://google.com
parser = etree.HTMLParser(recover=True, encoding=encoding)
tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
og = yield self._calc_og(tree, media_info, requester)
else: