diff --git a/changelog.d/13056.feature b/changelog.d/13056.feature
new file mode 100644
index 000000000..219e2f6c1
--- /dev/null
+++ b/changelog.d/13056.feature
@@ -0,0 +1 @@
+Improve URL previews for sites which only provide Twitter Card metadata, e.g. LWN.net.
diff --git a/synapse/rest/media/v1/preview_html.py b/synapse/rest/media/v1/preview_html.py
index ed8f21a48..c826a1309 100644
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@@ -15,7 +15,16 @@ import codecs
import itertools
import logging
import re
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
+from typing import (
+ TYPE_CHECKING,
+ Callable,
+ Dict,
+ Generator,
+ Iterable,
+ Optional,
+ Set,
+ Union,
+)
if TYPE_CHECKING:
from lxml import etree
@@ -146,6 +155,70 @@ def decode_body(
return etree.fromstring(body, parser)
+def _get_meta_tags(
+    tree: "etree.Element",
+    property: str,
+    prefix: str,
+    property_mapper: Optional[Callable[[str], Optional[str]]] = None,
+) -> Dict[str, Optional[str]]:
+    """
+    Collect the meta tags whose identifying attribute starts with "<prefix>:".
+
+    Only meta tags carrying a non-empty "content" attribute are considered.
+
+    Args:
+        tree: The parsed HTML document.
+        property: The attribute that holds the tag name, e.g. "property" for
+            Open Graph.
+        prefix: The prefix (without the trailing colon) the tag name must start
+            with, e.g. "og" for Open Graph.
+        property_mapper: An optional callable used to rewrite each tag name into
+            its Open Graph form. Returning None drops that tag.
+
+    Returns:
+        A map of (possibly rewritten) tag name to content. If a name occurs
+        more than once the last occurrence wins. An empty map is returned when
+        the page exceeds the tag limit.
+    """
+    query = f"//*/meta[starts-with(@{property}, '{prefix}:')][@content][not(@content='')]"
+
+    found: Dict[str, Optional[str]] = {}
+    for meta in tree.xpath(query):
+        # Once 50 distinct names have been collected, any further matching tag
+        # causes the whole page to be rejected instead of being truncated.
+        if len(found) >= 50:
+            logger.warning(
+                "Skipping parsing of Open Graph for page with too many '%s:' tags",
+                prefix,
+            )
+            return {}
+
+        name = meta.attrib[property]
+        if property_mapper:
+            name = property_mapper(name)
+            # The mapper signals "ignore this tag" by returning None.
+            if name is None:
+                continue
+
+        found[name] = meta.attrib["content"]
+
+    return found
+
+
+def _map_twitter_to_open_graph(key: str) -> Optional[str]:
+    """
+    Translate a Twitter card property name into its Open Graph counterpart.
+
+    Args:
+        key: The Twitter card property (starts with "twitter:").
+
+    Returns:
+        The matching Open Graph property (starts with "og:"), or None if the
+        property should be ignored.
+    """
+    # Properties that need special handling: None means there is no analogous
+    # Open Graph property, so the tag is dropped.
+    special_cases = {
+        "twitter:card": None,
+        "twitter:creator": None,
+        "twitter:site": "og:site_name",
+    }
+    if key in special_cases:
+        return special_cases[key]
+
+    # Everything else keeps its suffix; only the "twitter" prefix is replaced.
+    suffix = key[len("twitter") :]
+    return f"og{suffix}"
+
+
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
"""
Parse the HTML document into an Open Graph response.
@@ -160,10 +233,8 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
The Open Graph response as a dictionary.
"""
- # if we see any image URLs in the OG response, then spider them
- # (although the client could choose to do this by asking for previews of those
- # URLs to avoid DoSing the server)
-
+ # Search for Open Graph (og:) meta tags, e.g.:
+ #
# "og:type" : "video",
# "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw",
# "og:site_name" : "YouTube",
@@ -176,19 +247,11 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:height" : "720",
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
- og: Dict[str, Optional[str]] = {}
- for tag in tree.xpath(
- "//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
- ):
- # if we've got more than 50 tags, someone is taking the piss
- if len(og) >= 50:
- logger.warning("Skipping OG for page with too many 'og:' tags")
- return {}
-
- og[tag.attrib["property"]] = tag.attrib["content"]
-
- # TODO: grab article: meta tags too, e.g.:
+ og = _get_meta_tags(tree, "property", "og")
+ # TODO: Search for properties specific to the different Open Graph types,
+ # such as article: meta tags, e.g.:
+ #
# "article:publisher" : "https://www.facebook.com/thethudonline" />
# "article:author" content="https://www.facebook.com/thethudonline" />
# "article:tag" content="baby" />
@@ -196,6 +259,21 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "article:published_time" content="2016-03-31T19:58:24+00:00" />
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />
+ # Search for Twitter Card (twitter:) meta tags, e.g.:
+ #
+ # "twitter:site" : "@matrixdotorg"
+ # "twitter:creator" : "@matrixdotorg"
+ #
+ # Twitter Card tags also duplicate Open Graph tags.
+ #
+ # See https://developer.twitter.com/en/docs/twitter-for-websites/cards/guides/getting-started
+ twitter = _get_meta_tags(tree, "name", "twitter", _map_twitter_to_open_graph)
+ # Merge the Twitter values with the Open Graph values, but do not overwrite
+ # information from Open Graph tags.
+ for key, value in twitter.items():
+ if key not in og:
+ og[key] = value
+
if "og:title" not in og:
# Attempt to find a title from the title tag, or the biggest header on the page.
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
diff --git a/tests/rest/media/v1/test_html_preview.py b/tests/rest/media/v1/test_html_preview.py
index ea9e5889b..cbdf210ae 100644
--- a/tests/rest/media/v1/test_html_preview.py
+++ b/tests/rest/media/v1/test_html_preview.py
@@ -370,6 +370,47 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
+    def test_twitter_tag(self) -> None:
+        """Twitter card tags should be used if nothing else is available."""
+        html = b"""
+        <html>
+        <meta name="twitter:card" content="summary">
+        <meta name="twitter:description" content="Description">
+        <meta name="twitter:site" content="@matrixdotorg">
+        </html>
+        """
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+        self.assertEqual(
+            og,
+            {
+                "og:title": None,
+                "og:description": "Description",
+                "og:site_name": "@matrixdotorg",
+            },
+        )
+
+        # But they shouldn't override Open Graph values.
+        html = b"""
+        <html>
+        <meta name="twitter:card" content="summary">
+        <meta name="twitter:description" content="Description">
+        <meta property="og:description" content="Real Description">
+        <meta name="twitter:site" content="@matrixdotorg">
+        <meta property="og:site_name" content="matrix.org">
+        </html>
+        """
+        tree = decode_body(html, "http://example.com/test.html")
+        og = parse_html_to_open_graph(tree)
+        self.assertEqual(
+            og,
+            {
+                "og:title": None,
+                "og:description": "Real Description",
+                "og:site_name": "matrix.org",
+            },
+        )
+
class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset(self) -> None: