Try to recover from unknown encodings when previewing media. (#9164)

Treat unknown encodings (according to lxml) as UTF-8 when generating a preview for HTML documents. This isn't fully accurate, but will hopefully give a reasonable title and summary.
2025-08-03 17:44:15 -04:00 · 2021-01-26 07:32:17 -05:00 · 2021-01-26 07:32:17 -05:00 · 4937fe3d6b
commit 4937fe3d6b
parent e74bb96733
3 changed files with 64 additions and 10 deletions
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -261,3 +261,32 @@ class PreviewUrlTestCase(unittest.TestCase):
        html = ""
        og = decode_and_calc_og(html, "http://example.com/test.html")
        self.assertEqual(og, {})
+
+    def test_invalid_encoding(self):
+        """An unrecognised character encoding should be ignored and the body treated as UTF-8, if possible."""
+        html = """
+        <html>
+        <head><title>Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        og = decode_and_calc_og(
+            html, "http://example.com/test.html", "invalid-encoding"
+        )
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
+    def test_invalid_encoding2(self):
+        """A body containing bytes that are invalid UTF-8 should still yield a title and description."""
+        # The title contains \xff\xff, an invalid UTF-8 sequence; the expected title below renders it as "ÿÿ".
+        html = b"""
+        <html>
+        <head><title>\xff\xff Foo</title></head>
+        <body>
+        Some text.
+        </body>
+        </html>
+        """
+        og = decode_and_calc_og(html, "http://example.com/test.html")
+        self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})