mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2025-11-30 22:56:42 -05:00
Move HTML parsing to a separate file for URL previews. (#11566)
* Splits the logic for parsing HTML from the resource handling code.
* Fixes a circular import in the oEmbed code (which uses the HTML parsing code).
* Renames some of the HTML parsing methods to:
  * Make it clear which methods are "internal" to the module.
  * Clarify what the methods do.
This commit is contained in:
parent
5305a5e881
commit
eb39da6782
6 changed files with 432 additions and 401 deletions
|
|
@ -12,10 +12,10 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from synapse.rest.media.v1.preview_url_resource import (
|
||||
_calc_og,
|
||||
from synapse.rest.media.v1.preview_html import (
|
||||
_get_html_media_encodings,
|
||||
decode_body,
|
||||
get_html_media_encodings,
|
||||
parse_html_to_open_graph,
|
||||
summarize_paragraphs,
|
||||
)
|
||||
|
||||
|
|
@ -160,7 +160,7 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
"""
|
||||
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
|
||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||
|
||||
|
|
@ -176,7 +176,7 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
"""
|
||||
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
|
||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||
|
||||
|
|
@ -195,7 +195,7 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
"""
|
||||
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
|
||||
self.assertEqual(
|
||||
og,
|
||||
|
|
@ -217,7 +217,7 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
"""
|
||||
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
|
||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||
|
||||
|
|
@ -231,7 +231,7 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
"""
|
||||
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
|
||||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
||||
|
||||
|
|
@ -246,7 +246,7 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
"""
|
||||
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
|
||||
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
|
||||
|
||||
|
|
@ -261,7 +261,7 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
"""
|
||||
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
|
||||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
||||
|
||||
|
|
@ -289,7 +289,7 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
<head><title>Foo</title></head><body>Some text.</body></html>
|
||||
""".strip()
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||
|
||||
def test_invalid_encoding(self):
|
||||
|
|
@ -303,7 +303,7 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
</html>
|
||||
"""
|
||||
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||
|
||||
def test_invalid_encoding2(self):
|
||||
|
|
@ -318,7 +318,7 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
</html>
|
||||
"""
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
|
||||
|
||||
def test_windows_1252(self):
|
||||
|
|
@ -332,14 +332,14 @@ class CalcOgTestCase(unittest.TestCase):
|
|||
</html>
|
||||
"""
|
||||
tree = decode_body(html, "http://example.com/test.html")
|
||||
og = _calc_og(tree, "http://example.com/test.html")
|
||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
||||
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
|
||||
|
||||
|
||||
class MediaEncodingTestCase(unittest.TestCase):
|
||||
def test_meta_charset(self):
|
||||
"""A character encoding is found via the meta tag."""
|
||||
encodings = get_html_media_encodings(
|
||||
encodings = _get_html_media_encodings(
|
||||
b"""
|
||||
<html>
|
||||
<head><meta charset="ascii">
|
||||
|
|
@ -351,7 +351,7 @@ class MediaEncodingTestCase(unittest.TestCase):
|
|||
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
|
||||
|
||||
# A less well-formed version.
|
||||
encodings = get_html_media_encodings(
|
||||
encodings = _get_html_media_encodings(
|
||||
b"""
|
||||
<html>
|
||||
<head>< meta charset = ascii>
|
||||
|
|
@ -364,7 +364,7 @@ class MediaEncodingTestCase(unittest.TestCase):
|
|||
|
||||
def test_meta_charset_underscores(self):
|
||||
"""A character encoding contains underscore."""
|
||||
encodings = get_html_media_encodings(
|
||||
encodings = _get_html_media_encodings(
|
||||
b"""
|
||||
<html>
|
||||
<head><meta charset="Shift_JIS">
|
||||
|
|
@ -377,7 +377,7 @@ class MediaEncodingTestCase(unittest.TestCase):
|
|||
|
||||
def test_xml_encoding(self):
|
||||
"""A character encoding is found via the meta tag."""
|
||||
encodings = get_html_media_encodings(
|
||||
encodings = _get_html_media_encodings(
|
||||
b"""
|
||||
<?xml version="1.0" encoding="ascii"?>
|
||||
<html>
|
||||
|
|
@ -389,7 +389,7 @@ class MediaEncodingTestCase(unittest.TestCase):
|
|||
|
||||
def test_meta_xml_encoding(self):
|
||||
"""Meta tags take precedence over XML encoding."""
|
||||
encodings = get_html_media_encodings(
|
||||
encodings = _get_html_media_encodings(
|
||||
b"""
|
||||
<?xml version="1.0" encoding="ascii"?>
|
||||
<html>
|
||||
|
|
@ -413,17 +413,17 @@ class MediaEncodingTestCase(unittest.TestCase):
|
|||
'text/html; charset=ascii";',
|
||||
)
|
||||
for header in headers:
|
||||
encodings = get_html_media_encodings(b"", header)
|
||||
encodings = _get_html_media_encodings(b"", header)
|
||||
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
|
||||
|
||||
def test_fallback(self):
|
||||
"""A character encoding cannot be found in the body or header."""
|
||||
encodings = get_html_media_encodings(b"", "text/html")
|
||||
encodings = _get_html_media_encodings(b"", "text/html")
|
||||
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
|
||||
|
||||
def test_duplicates(self):
|
||||
"""Ensure each encoding is only attempted once."""
|
||||
encodings = get_html_media_encodings(
|
||||
encodings = _get_html_media_encodings(
|
||||
b"""
|
||||
<?xml version="1.0" encoding="utf8"?>
|
||||
<html>
|
||||
|
|
@ -437,7 +437,7 @@ class MediaEncodingTestCase(unittest.TestCase):
|
|||
|
||||
def test_unknown_invalid(self):
|
||||
"""A character encoding should be ignored if it is unknown or invalid."""
|
||||
encodings = get_html_media_encodings(
|
||||
encodings = _get_html_media_encodings(
|
||||
b"""
|
||||
<html>
|
||||
<head><meta charset="invalid">
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue