mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2025-05-03 15:26:10 -04:00
Handle additional errors when previewing URLs. (#9333)
* Handle the case of lxml not finding a document tree. * Parse the document encoding from the XML tag.
This commit is contained in:
parent
b0b2cac057
commit
0963d39ea6
3 changed files with 145 additions and 30 deletions
|
@ -15,6 +15,7 @@
|
|||
|
||||
from synapse.rest.media.v1.preview_url_resource import (
|
||||
decode_and_calc_og,
|
||||
get_html_media_encoding,
|
||||
summarize_paragraphs,
|
||||
)
|
||||
|
||||
|
@ -26,7 +27,7 @@ except ImportError:
|
|||
lxml = None
|
||||
|
||||
|
||||
class PreviewTestCase(unittest.TestCase):
|
||||
class SummarizeTestCase(unittest.TestCase):
|
||||
if not lxml:
|
||||
skip = "url preview feature requires lxml"
|
||||
|
||||
|
@ -144,12 +145,12 @@ class PreviewTestCase(unittest.TestCase):
|
|||
)
|
||||
|
||||
|
||||
class PreviewUrlTestCase(unittest.TestCase):
|
||||
class CalcOgTestCase(unittest.TestCase):
|
||||
if not lxml:
|
||||
skip = "url preview feature requires lxml"
|
||||
|
||||
def test_simple(self):
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<head><title>Foo</title></head>
|
||||
<body>
|
||||
|
@ -163,7 +164,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
|||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||
|
||||
def test_comment(self):
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<head><title>Foo</title></head>
|
||||
<body>
|
||||
|
@ -178,7 +179,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
|||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||
|
||||
def test_comment2(self):
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<head><title>Foo</title></head>
|
||||
<body>
|
||||
|
@ -202,7 +203,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
|||
)
|
||||
|
||||
def test_script(self):
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<head><title>Foo</title></head>
|
||||
<body>
|
||||
|
@ -217,7 +218,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
|||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||
|
||||
def test_missing_title(self):
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<body>
|
||||
Some text.
|
||||
|
@ -230,7 +231,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
|||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
||||
|
||||
def test_h1_as_title(self):
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<meta property="og:description" content="Some text."/>
|
||||
<body>
|
||||
|
@ -244,7 +245,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
|||
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
|
||||
|
||||
def test_missing_title_and_broken_h1(self):
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<body>
|
||||
<h1><a href="foo"/></h1>
|
||||
|
@ -258,13 +259,20 @@ class PreviewUrlTestCase(unittest.TestCase):
|
|||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
||||
|
||||
def test_empty(self):
|
||||
html = ""
|
||||
"""Test a body with no data in it."""
|
||||
html = b""
|
||||
og = decode_and_calc_og(html, "http://example.com/test.html")
|
||||
self.assertEqual(og, {})
|
||||
|
||||
def test_no_tree(self):
|
||||
"""A valid body with no tree in it."""
|
||||
html = b"\x00"
|
||||
og = decode_and_calc_og(html, "http://example.com/test.html")
|
||||
self.assertEqual(og, {})
|
||||
|
||||
def test_invalid_encoding(self):
|
||||
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
|
||||
html = """
|
||||
html = b"""
|
||||
<html>
|
||||
<head><title>Foo</title></head>
|
||||
<body>
|
||||
|
@ -290,3 +298,76 @@ class PreviewUrlTestCase(unittest.TestCase):
|
|||
"""
|
||||
og = decode_and_calc_og(html, "http://example.com/test.html")
|
||||
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
|
||||
|
||||
|
||||
class MediaEncodingTestCase(unittest.TestCase):
|
||||
def test_meta_charset(self):
|
||||
"""A character encoding is found via the meta tag."""
|
||||
encoding = get_html_media_encoding(
|
||||
b"""
|
||||
<html>
|
||||
<head><meta charset="ascii">
|
||||
</head>
|
||||
</html>
|
||||
""",
|
||||
"text/html",
|
||||
)
|
||||
self.assertEqual(encoding, "ascii")
|
||||
|
||||
# A less well-formed version.
|
||||
encoding = get_html_media_encoding(
|
||||
b"""
|
||||
<html>
|
||||
<head>< meta charset = ascii>
|
||||
</head>
|
||||
</html>
|
||||
""",
|
||||
"text/html",
|
||||
)
|
||||
self.assertEqual(encoding, "ascii")
|
||||
|
||||
def test_xml_encoding(self):
|
||||
"""A character encoding is found via the meta tag."""
|
||||
encoding = get_html_media_encoding(
|
||||
b"""
|
||||
<?xml version="1.0" encoding="ascii"?>
|
||||
<html>
|
||||
</html>
|
||||
""",
|
||||
"text/html",
|
||||
)
|
||||
self.assertEqual(encoding, "ascii")
|
||||
|
||||
def test_meta_xml_encoding(self):
|
||||
"""Meta tags take precedence over XML encoding."""
|
||||
encoding = get_html_media_encoding(
|
||||
b"""
|
||||
<?xml version="1.0" encoding="ascii"?>
|
||||
<html>
|
||||
<head><meta charset="UTF-16">
|
||||
</head>
|
||||
</html>
|
||||
""",
|
||||
"text/html",
|
||||
)
|
||||
self.assertEqual(encoding, "UTF-16")
|
||||
|
||||
def test_content_type(self):
|
||||
"""A character encoding is found via the Content-Type header."""
|
||||
# Test a few variations of the header.
|
||||
headers = (
|
||||
'text/html; charset="ascii";',
|
||||
"text/html;charset=ascii;",
|
||||
'text/html; charset="ascii"',
|
||||
"text/html; charset=ascii",
|
||||
'text/html; charset="ascii;',
|
||||
'text/html; charset=ascii";',
|
||||
)
|
||||
for header in headers:
|
||||
encoding = get_html_media_encoding(b"", header)
|
||||
self.assertEqual(encoding, "ascii")
|
||||
|
||||
def test_fallback(self):
|
||||
"""A character encoding cannot be found in the body or header."""
|
||||
encoding = get_html_media_encoding(b"", "text/html")
|
||||
self.assertEqual(encoding, "utf-8")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue