Explicitly pass in the charset from Content-Type to lxml to better fix Cyrillic woes

2025-11-28 01:01:20 -05:00 · 2016-04-15 14:32:25 +01:00 · 2016-04-15 14:32:25 +01:00 · aaabbd3e9e
commit aaabbd3e9e
parent 84f9cac4d0
1 changed file with 18 additions and 6 deletions
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -179,15 +179,27 @@ class PreviewUrlResource(BaseMediaResource):
        elif self._is_html(media_info['media_type']):
            # TODO: somehow stop a big HTML tree from exploding synapse's RAM

-            from lxml import html
+            from lxml import etree

-            # XXX: always manually try to decode body as utf-8 first, which
-            # seems to help with most character encoding woes.
-            # XXX: handle non-utf-8 encodings?
            file = open(media_info['filename'])
            body = file.read()
            file.close()
-            tree = html.fromstring(body.decode('utf-8', 'ignore'))
+
+            # clobber the encoding from the content-type, or default to utf-8
+            # XXX: this overrides any <meta/> or XML charset headers in the body
+            # which may pose problems, but so far seems to work okay.
+            match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
+            encoding = match.group(1) if match else "utf-8"
+
+            try:
+                parser = etree.HTMLParser(recover=True, encoding=encoding)
+                tree = etree.fromstring(body, parser)
+                og = yield self._calc_og(tree, media_info, requester)
+            except UnicodeDecodeError:
+                # blindly try decoding the body as utf-8, which seems to fix
+                # the charset mismatches on https://google.com
+                parser = etree.HTMLParser(recover=True, encoding=encoding)
+                tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser)
                og = yield self._calc_og(tree, media_info, requester)

        else: