Move HTML parsing to a separate file for URL previews. (#11566)

* Splits the logic for parsing HTML from the resource handling code.
* Fixes a circular import in the oEmbed code (which uses the HTML parsing code).
* Renames some of the HTML parsing methods to:
  * Make it clear which methods are "internal" to the module.
  * Clarify what the methods do.
This commit is contained in:
Patrick Cloke 2021-12-13 12:55:07 -05:00 committed by GitHub
parent 5305a5e881
commit eb39da6782
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 432 additions and 401 deletions

View file

@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from synapse.rest.media.v1.preview_url_resource import (
_calc_og,
from synapse.rest.media.v1.preview_html import (
_get_html_media_encodings,
decode_body,
get_html_media_encodings,
parse_html_to_open_graph,
summarize_paragraphs,
)
@ -160,7 +160,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@ -176,7 +176,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@ -195,7 +195,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(
og,
@ -217,7 +217,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@ -231,7 +231,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
@ -246,7 +246,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
@ -261,7 +261,7 @@ class CalcOgTestCase(unittest.TestCase):
"""
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
@ -289,7 +289,7 @@ class CalcOgTestCase(unittest.TestCase):
<head><title>Foo</title></head><body>Some text.</body></html>
""".strip()
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_invalid_encoding(self):
@ -303,7 +303,7 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_invalid_encoding2(self):
@ -318,7 +318,7 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
def test_windows_1252(self):
@ -332,14 +332,14 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset(self):
"""A character encoding is found via the meta tag."""
encodings = get_html_media_encodings(
encodings = _get_html_media_encodings(
b"""
<html>
<head><meta charset="ascii">
@ -351,7 +351,7 @@ class MediaEncodingTestCase(unittest.TestCase):
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
# A less well-formed version.
encodings = get_html_media_encodings(
encodings = _get_html_media_encodings(
b"""
<html>
<head>< meta charset = ascii>
@ -364,7 +364,7 @@ class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset_underscores(self):
"""A character encoding contains underscore."""
encodings = get_html_media_encodings(
encodings = _get_html_media_encodings(
b"""
<html>
<head><meta charset="Shift_JIS">
@ -377,7 +377,7 @@ class MediaEncodingTestCase(unittest.TestCase):
def test_xml_encoding(self):
"""A character encoding is found via the XML declaration."""
encodings = get_html_media_encodings(
encodings = _get_html_media_encodings(
b"""
<?xml version="1.0" encoding="ascii"?>
<html>
@ -389,7 +389,7 @@ class MediaEncodingTestCase(unittest.TestCase):
def test_meta_xml_encoding(self):
"""Meta tags take precedence over XML encoding."""
encodings = get_html_media_encodings(
encodings = _get_html_media_encodings(
b"""
<?xml version="1.0" encoding="ascii"?>
<html>
@ -413,17 +413,17 @@ class MediaEncodingTestCase(unittest.TestCase):
'text/html; charset=ascii";',
)
for header in headers:
encodings = get_html_media_encodings(b"", header)
encodings = _get_html_media_encodings(b"", header)
self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
def test_fallback(self):
"""A character encoding cannot be found in the body or header."""
encodings = get_html_media_encodings(b"", "text/html")
encodings = _get_html_media_encodings(b"", "text/html")
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
def test_duplicates(self):
"""Ensure each encoding is only attempted once."""
encodings = get_html_media_encodings(
encodings = _get_html_media_encodings(
b"""
<?xml version="1.0" encoding="utf8"?>
<html>
@ -437,7 +437,7 @@ class MediaEncodingTestCase(unittest.TestCase):
def test_unknown_invalid(self):
"""A character encoding should be ignored if it is unknown or invalid."""
encodings = get_html_media_encodings(
encodings = _get_html_media_encodings(
b"""
<html>
<head><meta charset="invalid">