Improve URL previews for some pages (#12951)

* Skip `og` and `meta` tags where the value is empty.
* Fallback to the favicon if there are no other images.
* Ignore tags meant for navigation.
This commit is contained in:
Patrick Cloke 2022-06-03 12:09:12 -04:00 committed by GitHub
parent 888a29f412
commit 01df5bacac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 72 additions and 18 deletions

View File

@ -0,0 +1 @@
Improve URL previews for pages with empty elements.

View File

@ -30,6 +30,9 @@ _xml_encoding_match = re.compile(
)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
# Certain elements aren't meant for display.
ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}
def _normalise_encoding(encoding: str) -> Optional[str]:
"""Use the Python codec's name as the normalised entry."""
@ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
og: Dict[str, Optional[str]] = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
if "content" in tag.attrib:
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]
for tag in tree.xpath(
"//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
):
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]
# TODO: grab article: meta tags too, e.g.:
@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
if "og:title" not in og:
# do some basic spidering of the HTML
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
if title and title[0].text is not None:
og["og:title"] = title[0].text.strip()
# Attempt to find a title from the title tag, or the biggest header on the page.
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
if title:
og["og:title"] = title[0].strip()
else:
og["og:title"] = None
if "og:image" not in og:
# TODO: extract a favicon failing all else
meta_image = tree.xpath(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
)
# If a meta image is found, use it.
if meta_image:
og["og:image"] = meta_image[0]
else:
# Try to find images which are larger than 10px by 10px.
#
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
images = sorted(
@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
-1 * float(i.attrib["width"]) * float(i.attrib["height"])
),
)
# If no images were found, try to find *any* images.
if not images:
images = tree.xpath("//img[@src]")
images = tree.xpath("//img[@src][1]")
if images:
og["og:image"] = images[0].attrib["src"]
# Finally, fallback to the favicon if nothing else.
else:
favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
if favicons:
og["og:image"] = favicons[0]
if "og:description" not in og:
# Check the first meta description tag for content.
meta_description = tree.xpath(
"//*/meta"
"[translate(@name, 'DESCRIPTION', 'description')='description']"
"/@content"
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
)
# If a meta description is found with content, use it.
if meta_description:
og["og:description"] = meta_description[0]
else:
@ -306,6 +320,10 @@ def _iterate_over_text(
if isinstance(el, str):
yield el
elif el.tag not in tags_to_ignore:
# If the element isn't meant for display, ignore it.
if el.get("role") in ARIA_ROLES_TO_IGNORE:
continue
# el.text is the text before the first child, so we can immediately
# return it if the text exists.
if el.text:

View File

@ -145,7 +145,7 @@ class SummarizeTestCase(unittest.TestCase):
)
class CalcOgTestCase(unittest.TestCase):
class OpenGraphFromHtmlTestCase(unittest.TestCase):
if not lxml:
skip = "url preview feature requires lxml"
@ -235,6 +235,21 @@ class CalcOgTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
# Another variant is a title with no content.
html = b"""
<html>
<head><title></title></head>
<body>
<h1>Title</h1>
</body>
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
def test_h1_as_title(self) -> None:
html = b"""
<html>
@ -250,6 +265,26 @@ class CalcOgTestCase(unittest.TestCase):
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
def test_empty_description(self) -> None:
"""Description tags with empty content should be ignored."""
html = b"""
<html>
<meta property="og:description" content=""/>
<meta property="og:description"/>
<meta name="description" content=""/>
<meta name="description"/>
<meta name="description" content="Finally!"/>
<body>
<h1>Title</h1>
</body>
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})
def test_missing_title_and_broken_h1(self) -> None:
html = b"""
<html>