Improve URL previews for some pages (#12951)

* Skip `og` and `meta` tags where the value is empty.
* Fall back to the favicon if there are no other images.
* Ignore tags meant for navigation.
This commit is contained in:
Patrick Cloke 2022-06-03 12:09:12 -04:00 committed by GitHub
parent 888a29f412
commit 01df5bacac
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 72 additions and 18 deletions

View file

@ -145,7 +145,7 @@ class SummarizeTestCase(unittest.TestCase):
)
class CalcOgTestCase(unittest.TestCase):
class OpenGraphFromHtmlTestCase(unittest.TestCase):
if not lxml:
skip = "url preview feature requires lxml"
@ -235,6 +235,21 @@ class CalcOgTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
# Another variant is a title with no content.
html = b"""
<html>
<head><title></title></head>
<body>
<h1>Title</h1>
</body>
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
def test_h1_as_title(self) -> None:
html = b"""
<html>
@ -250,6 +265,26 @@ class CalcOgTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
def test_empty_description(self) -> None:
    """Empty or content-less description tags are skipped; the first one with text is used."""
    # Four unusable description tags (empty `content` or no `content` at all)
    # come before the only one that actually carries text.
    page = b"""
<html>
<meta property="og:description" content=""/>
<meta property="og:description"/>
<meta name="description" content=""/>
<meta name="description"/>
<meta name="description" content="Finally!"/>
<body>
<h1>Title</h1>
</body>
</html>
"""
    parsed = decode_body(page, "http://example.com/test.html")
    summary = parse_html_to_open_graph(parsed)

    expected = {"og:title": "Title", "og:description": "Finally!"}
    self.assertEqual(summary, expected)
def test_missing_title_and_broken_h1(self) -> None:
html = b"""
<html>