mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2024-10-01 11:49:51 -04:00
Explicitly pass the charset from Content-Type to lxml to better fix Cyrillic encoding woes
This commit is contained in:
parent
84f9cac4d0
commit
aaabbd3e9e
@ -179,15 +179,27 @@ class PreviewUrlResource(BaseMediaResource):
|
||||
elif self._is_html(media_info['media_type']):
|
||||
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
|
||||
|
||||
from lxml import html
|
||||
from lxml import etree
|
||||
|
||||
# XXX: always manually try to decode body as utf-8 first, which
|
||||
# seems to help with most character encoding woes.
|
||||
# XXX: handle non-utf-8 encodings?
|
||||
file = open(media_info['filename'])
|
||||
body = file.read()
|
||||
file.close()
|
||||
tree = html.fromstring(body.decode('utf-8', 'ignore'))
|
||||
|
||||
# clobber the encoding from the content-type, or default to utf-8
|
||||
# XXX: this overrides any <meta/> or XML charset headers in the body
|
||||
# which may pose problems, but so far seems to work okay.
|
||||
match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
|
||||
encoding = match.group(1) if match else "utf-8"
|
||||
|
||||
try:
|
||||
parser = etree.HTMLParser(recover=True, encoding=encoding)
|
||||
tree = etree.fromstring(body, parser)
|
||||
og = yield self._calc_og(tree, media_info, requester)
|
||||
except UnicodeDecodeError:
|
||||
# blindly try decoding the body as utf-8, which seems to fix
|
||||
# the charset mismatches on https://google.com
|
||||
parser = etree.HTMLParser(recover=True, encoding=encoding)
|
||||
tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
|
||||
og = yield self._calc_og(tree, media_info, requester)
|
||||
|
||||
else:
|
||||
|
Loading…
Reference in New Issue
Block a user