Fix Cyrillic URL previews by hardcoding all page decoding to UTF-8 for now, rather than relying on lxml's heuristics, which seem to get it wrong

This commit is contained in:
Matthew Hodgson 2016-04-15 13:19:57 +01:00
parent 737aee9295
commit 84f9cac4d0

View File

@@ -181,17 +181,9 @@ class PreviewUrlResource(BaseMediaResource):
from lxml import html from lxml import html
try: # XXX: always manually try to decode body as utf-8 first, which
tree = html.parse(media_info['filename']) # seems to help with most character encoding woes.
og = yield self._calc_og(tree, media_info, requester) # XXX: handle non-utf-8 encodings?
except UnicodeDecodeError:
# XXX: evil evil bodge
# Empirically, sites like google.com mix Latin-1 and utf-8
# encodings in the same page. The rogue Latin-1 characters
# cause lxml to choke with a UnicodeDecodeError, so if we
# see this we go and do a manual decode of the HTML before
# handing it to lxml as utf-8 encoding, counter-intuitively,
# which seems to make it happier...
file = open(media_info['filename']) file = open(media_info['filename'])
body = file.read() body = file.read()
file.close() file.close()