From 6fc8be9a1b2046e69e8c6f731442887e3addeec0 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Wed, 22 Sep 2021 09:45:20 -0400 Subject: [PATCH] Include more information in oEmbed previews. (#10819) * Improved titles (fall back to the author name if there's not title) and include the site name. * Handle photo/video payloads. * Include the original URL in the Open Graph response. * Fix the expiration time (by properly converting from seconds to milliseconds). --- changelog.d/10819.feature | 1 + synapse/rest/media/v1/oembed.py | 49 +++++++++++++++++-- synapse/rest/media/v1/preview_url_resource.py | 2 +- tests/rest/media/v1/test_url_preview.py | 30 ++++++++---- 4 files changed, 68 insertions(+), 14 deletions(-) create mode 100644 changelog.d/10819.feature diff --git a/changelog.d/10819.feature b/changelog.d/10819.feature new file mode 100644 index 000000000..4fa95a6cc --- /dev/null +++ b/changelog.d/10819.feature @@ -0,0 +1 @@ +Improve oEmbed previews by processing the author name, photo, and video information. diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 8b74e7265..e04671fb9 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -13,7 +13,7 @@ # limitations under the License. import logging import urllib.parse -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, List, Optional import attr @@ -22,6 +22,8 @@ from synapse.types import JsonDict from synapse.util import json_decoder if TYPE_CHECKING: + from lxml import etree + from synapse.server import HomeServer logger = logging.getLogger(__name__) @@ -31,7 +33,7 @@ logger = logging.getLogger(__name__) class OEmbedResult: # The Open Graph result (converted from the oEmbed result). open_graph_result: JsonDict - # Number of seconds to cache the content, according to the oEmbed response. + # Number of milliseconds to cache the content, according to the oEmbed response. # # This will be None if no cache-age is provided in the oEmbed response (or # if the oEmbed response cannot be turned into an Open Graph response). @@ -119,10 +121,22 @@ class OEmbedProvider: # Ensure the cache age is None or an int. cache_age = oembed.get("cache_age") if cache_age: - cache_age = int(cache_age) + cache_age = int(cache_age) * 1000 # The results. - open_graph_response = {"og:title": oembed.get("title")} + open_graph_response = { + "og:url": url, + } + + # Use either title or author's name as the title. + title = oembed.get("title") or oembed.get("author_name") + if title: + open_graph_response["og:title"] = title + + # Use the provider name and as the site. + provider_name = oembed.get("provider_name") + if provider_name: + open_graph_response["og:site_name"] = provider_name # If a thumbnail exists, use it. Note that dimensions will be calculated later. if "thumbnail_url" in oembed: @@ -137,6 +151,15 @@ class OEmbedProvider: # If this is a photo, use the full image, not the thumbnail. open_graph_response["og:image"] = oembed["url"] + elif oembed_type == "video": + open_graph_response["og:type"] = "video.other" + calc_description_and_urls(open_graph_response, oembed["html"]) + open_graph_response["og:video:width"] = oembed["width"] + open_graph_response["og:video:height"] = oembed["height"] + + elif oembed_type == "link": + open_graph_response["og:type"] = "website" + else: raise RuntimeError(f"Unknown oEmbed type: {oembed_type}") @@ -149,6 +172,14 @@ class OEmbedProvider: return OEmbedResult(open_graph_response, cache_age) +def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]: + results = [] + for tag in tree.xpath("//*/" + tag_name): + if "src" in tag.attrib: + results.append(tag.attrib["src"]) + return results + + def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None: """ Calculate description for an HTML document. @@ -179,6 +210,16 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> if tree is None: return + # Attempt to find interesting URLs (images, videos, embeds). + if "og:image" not in open_graph_response: + image_urls = _fetch_urls(tree, "img") + if image_urls: + open_graph_response["og:image"] = image_urls[0] + + video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed") + if video_urls: + open_graph_response["og:video"] = video_urls[0] + from synapse.rest.media.v1.preview_url_resource import _calc_description description = _calc_description(tree) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 0a0b476d2..9ffa983fb 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -305,7 +305,7 @@ class PreviewUrlResource(DirectServeJsonResource): with open(media_info.filename, "rb") as file: body = file.read() - oembed_response = self._oembed.parse_oembed_response(media_info.uri, body) + oembed_response = self._oembed.parse_oembed_response(url, body) og = oembed_response.open_graph_result # Use the cache age from the oEmbed result, instead of the HTTP response. diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index 9d1389958..d83dfacfe 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -620,11 +620,12 @@ class URLPreviewTests(unittest.HomeserverTestCase): self.assertIn(b"/matrixdotorg", server.data) self.assertEqual(channel.code, 200) - self.assertIsNone(channel.json_body["og:title"]) - self.assertTrue(channel.json_body["og:image"].startswith("mxc://")) - self.assertEqual(channel.json_body["og:image:height"], 1) - self.assertEqual(channel.json_body["og:image:width"], 1) - self.assertEqual(channel.json_body["og:image:type"], "image/png") + body = channel.json_body + self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345") + self.assertTrue(body["og:image"].startswith("mxc://")) + self.assertEqual(body["og:image:height"], 1) + self.assertEqual(body["og:image:width"], 1) + self.assertEqual(body["og:image:type"], "image/png") def test_oembed_rich(self): """Test an oEmbed endpoint which returns HTML content via the 'rich' type.""" @@ -633,6 +634,8 @@ class URLPreviewTests(unittest.HomeserverTestCase): result = { "version": "1.0", "type": "rich", + # Note that this provides the author, not the title. + "author_name": "Alice", "html": "
Content Preview
", } end_content = json.dumps(result).encode("utf-8") @@ -660,9 +663,14 @@ class URLPreviewTests(unittest.HomeserverTestCase): self.pump() self.assertEqual(channel.code, 200) + body = channel.json_body self.assertEqual( - channel.json_body, - {"og:title": None, "og:description": "Content Preview"}, + body, + { + "og:url": "http://twitter.com/matrixdotorg/status/12345", + "og:title": "Alice", + "og:description": "Content Preview", + }, ) def test_oembed_format(self): @@ -705,7 +713,11 @@ class URLPreviewTests(unittest.HomeserverTestCase): self.assertIn(b"format=json", server.data) self.assertEqual(channel.code, 200) + body = channel.json_body self.assertEqual( - channel.json_body, - {"og:title": None, "og:description": "Content Preview"}, + body, + { + "og:url": "http://www.hulu.com/watch/12345", + "og:description": "Content Preview", + }, )