Use <meta> tags to discover the per-page encoding of html previews (#4183)

2025-12-13 10:39:34 -05:00 · 2018-11-15 11:05:08 -06:00 · 2018-11-15 11:05:08 -06:00 · df758e155d
commit df758e155d
parent a51288e5d6
3 changed files with 100 additions and 9 deletions
--- a/changelog.d/4183.bugfix
+++ b/changelog.d/4183.bugfix
@ -0,0 +1 @@
 URL previews now correctly decode non-UTF-8 text if the page's HTML head contains a `<meta http-equiv="Content-Type" ...>` tag.
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@ -53,6 +53,9 @@ from ._base import FileInfo
 logger = logging.getLogger(__name__)
 # Finds a charset declared inside a <meta> tag of a raw (bytes) HTML body.
 # Covers both `<meta charset="utf-8">` and the http-equiv form
 # `<meta http-equiv="Content-Type" content="text/html; charset=utf-8">`.
 # The value may be preceded by an opening single or double quote, and may
 # contain underscores (e.g. Shift_JIS). Group 1 is the bare charset name.
 _charset_match = re.compile(
     br'<\s*meta[^>]*charset\s*=\s*["\']?([a-z0-9_-]+)', flags=re.I
 )
 # Pulls the charset parameter out of an HTTP Content-Type header value
 # (a text string), tolerating optional double quotes around the value.
 # Group 1 is the charset name.
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 class PreviewUrlResource(Resource):
    isLeaf = True
@ -223,13 +226,23 @@ class PreviewUrlResource(Resource):
            with open(media_info['filename'], 'rb') as file:
                body = file.read()
-            # clobber the encoding from the content-type, or default to utf-8
+            encoding = None
-            # XXX: this overrides any <meta/> or XML charset headers in the body
+
-            # which may pose problems, but so far seems to work okay.
+            # Let's try and figure out if it has an encoding set in a meta tag.
-            match = re.match(
+            # Limit it to the first 1kb, since it ought to be in the meta tags
-                r'.*; *charset="?(.*?)"?(;|$)',
+            # at the top.
-                media_info['media_type'],
+            match = _charset_match.search(body[:1000])
-                re.I
+
            # If we find a match, it should take precedence over the
            # Content-Type header, so set it here.
            if match:
                encoding = match.group(1).decode('ascii')
            # If we don't find a match, we'll look at the HTTP Content-Type, and
            # if that doesn't exist, we'll fall back to UTF-8.
            if not encoding:
                match = _content_type_match.match(
                    media_info['media_type']
                )
                encoding = match.group(1) if match else "utf-8"
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@ -162,3 +162,80 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        self.assertEqual(
            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
        )
    def test_non_ascii_preview_httpequiv(self):
        """
        A charset declared in a <meta http-equiv="Content-Type"> tag is used
        to decode the page, even when the HTTP Content-Type header names a
        different charset.
        """
        request, channel = self.make_request(
            "GET", "url_preview?url=matrix.org", shorthand=False
        )
        request.render(self.preview_url)
        self.pump()

        # Exactly one outbound fetch should have been started.
        self.assertEqual(len(self.fetches), 1)

        # The og:title value is encoded as windows-1251, as declared by the
        # http-equiv meta tag in the document itself.
        html = (
            b'<html><head>'
            b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
            b'<meta property="og:title" content="\xe4\xea\xe0" />'
            b'<meta property="og:description" content="hi" />'
            b'</head></html>'
        )
        size = len(html)
        response_headers = {
            b"Content-Length": [b"%d" % size],
            # The utf8 charset given here is expected to lose out to the
            # charset declared in the document's meta tag.
            b"Content-Type": [b'text/html; charset="utf8"'],
        }

        # Complete the pending fetch with the canned response.
        pending_fetch = self.fetches[0][0]
        pending_fetch.callback(
            (html, (size, response_headers, "https://example.com", 200))
        )

        self.pump()
        self.assertEqual(channel.code, 200)
        self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")
    def test_non_ascii_preview_content_type(self):
        """
        With no charset declared in the document, the charset from the HTTP
        Content-Type header is used to decode the page.
        """
        request, channel = self.make_request(
            "GET", "url_preview?url=matrix.org", shorthand=False
        )
        request.render(self.preview_url)
        self.pump()

        # Exactly one outbound fetch should have been started.
        self.assertEqual(len(self.fetches), 1)

        # The og:title value is encoded as windows-1251; the document carries
        # no charset declaration of its own.
        html = (
            b'<html><head>'
            b'<meta property="og:title" content="\xe4\xea\xe0" />'
            b'<meta property="og:description" content="hi" />'
            b'</head></html>'
        )
        size = len(html)
        response_headers = {
            b"Content-Length": [b"%d" % size],
            b"Content-Type": [b'text/html; charset="windows-1251"'],
        }

        # Complete the pending fetch with the canned response.
        pending_fetch = self.fetches[0][0]
        pending_fetch.callback(
            (html, (size, response_headers, "https://example.com", 200))
        )

        self.pump()
        self.assertEqual(channel.code, 200)
        self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")
		`@ -0,0 +1 @@`
							URL previews now correctly decode non-UTF-8 text if the page's HTML head contains a `<meta http-equiv="Content-Type" ...>` tag.