mirror of
https://git.anonymousland.org/anonymousland/synapse-product.git
synced 2025-08-03 17:44:15 -04:00
Try to recover from unknown encodings when previewing media. (#9164)
Treat unknown encodings (according to lxml) as UTF-8 when generating a preview for HTML documents. This isn't fully accurate, but will hopefully give a reasonable title and summary.
This commit is contained in:
parent
e74bb96733
commit
4937fe3d6b
3 changed files with 64 additions and 10 deletions
|
@ -261,3 +261,32 @@ class PreviewUrlTestCase(unittest.TestCase):
|
|||
html = ""
|
||||
og = decode_and_calc_og(html, "http://example.com/test.html")
|
||||
self.assertEqual(og, {})
|
||||
|
||||
def test_invalid_encoding(self):
    """An invalid character encoding should be ignored and the body treated as UTF-8, if possible.

    The document is passed to ``decode_and_calc_og`` together with the
    unrecognised encoding name "invalid-encoding"; the title and the
    description are still expected to be extracted.
    """
    # Built one line per literal so the whitespace in the document is explicit.
    markup = (
        "\n"
        "<html>\n"
        "<head><title>Foo</title></head>\n"
        "<body>\n"
        "Some text.\n"
        "</body>\n"
        "</html>\n"
    )
    expected = {"og:title": "Foo", "og:description": "Some text."}

    actual = decode_and_calc_og(
        markup, "http://example.com/test.html", "invalid-encoding"
    )

    self.assertEqual(actual, expected)
|
||||
|
||||
def test_invalid_encoding2(self):
    """A body which doesn't match the sent character encoding.

    No encoding argument is passed here. The expected title contains one
    "ÿ" (U+00FF) for each 0xFF byte in the body.
    """
    # The two 0xFF bytes in the title are not a valid UTF-8 sequence.
    # NOTE(review): the "ÿ" in the expected title suggests a single-byte
    # fallback decoding inside decode_and_calc_og -- confirm there.
    markup = (
        b"\n"
        b"<html>\n"
        b"<head><title>\xff\xff Foo</title></head>\n"
        b"<body>\n"
        b"Some text.\n"
        b"</body>\n"
        b"</html>\n"
    )
    expected = {"og:title": "ÿÿ Foo", "og:description": "Some text."}

    actual = decode_and_calc_og(markup, "http://example.com/test.html")

    self.assertEqual(actual, expected)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue