From 1b112840d2c6dafa131eba4f0285409bb7345661 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 8 Oct 2021 14:14:42 -0400 Subject: [PATCH] Autodiscover oEmbed endpoint from returned HTML (#10822) Searches the returned HTML for an oEmbed endpoint using the autodiscovery mechanism (``), and will request it to generate the preview. --- changelog.d/10822.feature | 1 + synapse/rest/media/v1/oembed.py | 26 +++++ synapse/rest/media/v1/preview_url_resource.py | 108 ++++++++++++------ tests/rest/media/v1/test_url_preview.py | 100 +++++++++++++++- tests/test_preview.py | 40 ++++--- 5 files changed, 222 insertions(+), 53 deletions(-) create mode 100644 changelog.d/10822.feature diff --git a/changelog.d/10822.feature b/changelog.d/10822.feature new file mode 100644 index 000000000..72566e31e --- /dev/null +++ b/changelog.d/10822.feature @@ -0,0 +1 @@ +Support autodiscovery of oEmbed previews. diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index e04671fb9..6d7e1f906 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -96,6 +96,32 @@ class OEmbedProvider: # No match. return None + def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]: + """ + Search an HTML document for oEmbed autodiscovery information. + + Args: + tree: The parsed HTML body. + + Returns: + The URL to use for oEmbed information, or None if no URL was found. + """ + # Search for link elements with the proper rel and type attributes. + for tag in tree.xpath( + "//link[@rel='alternate'][@type='application/json+oembed']" + ): + if "href" in tag.attrib: + return tag.attrib["href"] + + # Some providers (e.g. Flickr) use alternative instead of alternate. + for tag in tree.xpath( + "//link[@rel='alternative'][@type='application/json+oembed']" + ): + if "href" in tag.attrib: + return tag.attrib["href"] + + return None + def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult: """ Parse the oEmbed response into an Open Graph response. diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 044f44a39..1fe0fc8aa 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -22,7 +22,7 @@ import re import shutil import sys import traceback -from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Union +from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union from urllib import parse as urlparse import attr @@ -296,22 +296,32 @@ class PreviewUrlResource(DirectServeJsonResource): body = file.read() encoding = get_html_media_encoding(body, media_info.media_type) - og = decode_and_calc_og(body, media_info.uri, encoding) + tree = decode_body(body, encoding) + if tree is not None: + # Check if this HTML document points to oEmbed information and + # defer to that. + oembed_url = self._oembed.autodiscover_from_html(tree) + og = {} + if oembed_url: + oembed_info = await self._download_url(oembed_url, user) + og, expiration_ms = await self._handle_oembed_response( + url, oembed_info, expiration_ms + ) - await self._precache_image_url(user, media_info, og) + # If there was no oEmbed URL (or oEmbed parsing failed), attempt + # to generate the Open Graph information from the HTML. + if not oembed_url or not og: + og = _calc_og(tree, media_info.uri) - elif oembed_url and _is_json(media_info.media_type): - # Handle an oEmbed response. - with open(media_info.filename, "rb") as file: - body = file.read() - - oembed_response = self._oembed.parse_oembed_response(url, body) - og = oembed_response.open_graph_result - - # Use the cache age from the oEmbed result, instead of the HTTP response. - if oembed_response.cache_age is not None: - expiration_ms = oembed_response.cache_age + await self._precache_image_url(user, media_info, og) + else: + og = {} + elif oembed_url: + # Handle the oEmbed information. + og, expiration_ms = await self._handle_oembed_response( + url, media_info, expiration_ms + ) await self._precache_image_url(user, media_info, og) else: @@ -479,6 +489,39 @@ class PreviewUrlResource(DirectServeJsonResource): else: del og["og:image"] + async def _handle_oembed_response( + self, url: str, media_info: MediaInfo, expiration_ms: int + ) -> Tuple[JsonDict, int]: + """ + Parse the downloaded oEmbed info. + + Args: + url: The URL which is being previewed (not the one which was + requested). + media_info: The media being previewed. + expiration_ms: The length of time, in milliseconds, the media is valid for. + + Returns: + A tuple of: + The Open Graph dictionary, if the oEmbed info can be parsed. + The (possibly updated) length of time, in milliseconds, the media is valid for. + """ + # If JSON was not returned, there's nothing to do. + if not _is_json(media_info.media_type): + return {}, expiration_ms + + with open(media_info.filename, "rb") as file: + body = file.read() + + oembed_response = self._oembed.parse_oembed_response(url, body) + open_graph_result = oembed_response.open_graph_result + + # Use the cache age from the oEmbed result, if one was given. + if open_graph_result and oembed_response.cache_age is not None: + expiration_ms = oembed_response.cache_age + + return open_graph_result, expiration_ms + def _start_expire_url_cache_data(self) -> Deferred: return run_as_background_process( "expire_url_cache_data", self._expire_url_cache_data @@ -631,26 +674,22 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str: return "utf-8" -def decode_and_calc_og( - body: bytes, media_uri: str, request_encoding: Optional[str] = None -) -> JsonDict: +def decode_body( + body: bytes, request_encoding: Optional[str] = None +) -> Optional["etree.Element"]: """ - Calculate metadata for an HTML document. - - This uses lxml to parse the HTML document into the OG response. If errors - occur during processing of the document, an empty response is returned. + This uses lxml to parse the HTML document. Args: body: The HTML document, as bytes. - media_url: The URI used to download the body. request_encoding: The character encoding of the body, as a string. Returns: - The OG response as a dictionary. + The parsed HTML body, or None if an error occurred during processed. """ # If there's no body, nothing useful is going to be found. if not body: - return {} + return None from lxml import etree @@ -662,25 +701,22 @@ def decode_and_calc_og( parser = etree.HTMLParser(recover=True, encoding="utf-8") except Exception as e: logger.warning("Unable to create HTML parser: %s" % (e,)) - return {} + return None - def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]: - # Attempt to parse the body. If this fails, log and return no metadata. - tree = etree.fromstring(body_attempt, parser) - - # The data was successfully parsed, but no tree was found. - if tree is None: - return {} - - return _calc_og(tree, media_uri) + def _attempt_decode_body( + body_attempt: Union[bytes, str] + ) -> Optional["etree.Element"]: + # Attempt to parse the body. Returns None if the body was successfully + # parsed, but no tree was found. + return etree.fromstring(body_attempt, parser) # Attempt to parse the body. If this fails, log and return no metadata. try: - return _attempt_calc_og(body) + return _attempt_decode_body(body) except UnicodeDecodeError: # blindly try decoding the body as utf-8, which seems to fix # the charset mismatches on https://google.com - return _attempt_calc_og(body.decode("utf-8", "ignore")) + return _attempt_decode_body(body.decode("utf-8", "ignore")) def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]: diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index ce43de780..8698135a7 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -725,9 +725,107 @@ class URLPreviewTests(unittest.HomeserverTestCase): }, ) + def test_oembed_autodiscovery(self): + """ + Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL. + 1. Request a preview of a URL which is not known to the oEmbed code. + 2. It returns HTML including a link to an oEmbed preview. + 3. The oEmbed preview is requested and returns a URL for an image. + 4. The image is requested for thumbnailing. + """ + # This is a little cheesy in that we use the www subdomain (which isn't the + # list of oEmbed patterns) to get "raw" HTML response. + self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")] + self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] + self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")] + + result = b""" + + """ + + channel = self.make_request( + "GET", + "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345", + shorthand=False, + await_result=False, + ) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: text/html; charset="utf8"\r\n\r\n' + ) + % (len(result),) + + result + ) + + self.pump() + + # The oEmbed response. + result2 = { + "version": "1.0", + "type": "photo", + "url": "http://cdn.twitter.com/matrixdotorg", + } + oembed_content = json.dumps(result2).encode("utf-8") + + # Ensure a second request is made to the oEmbed URL. + client = self.reactor.tcpClients[1][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: application/json; charset="utf8"\r\n\r\n' + ) + % (len(oembed_content),) + + oembed_content + ) + + self.pump() + + # Ensure the URL is what was requested. + self.assertIn(b"/oembed?", server.data) + + # Ensure a third request is made to the photo URL. + client = self.reactor.tcpClients[2][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b"Content-Type: image/png\r\n\r\n" + ) + % (len(SMALL_PNG),) + + SMALL_PNG + ) + + self.pump() + + # Ensure the URL is what was requested. + self.assertIn(b"/matrixdotorg", server.data) + + self.assertEqual(channel.code, 200) + body = channel.json_body + self.assertEqual( + body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345" + ) + self.assertTrue(body["og:image"].startswith("mxc://")) + self.assertEqual(body["og:image:height"], 1) + self.assertEqual(body["og:image:width"], 1) + self.assertEqual(body["og:image:type"], "image/png") + def _download_image(self): """Downloads an image into the URL cache. - Returns: A (host, media_id) tuple representing the MXC URI of the image. """ diff --git a/tests/test_preview.py b/tests/test_preview.py index 48e792b55..09e017b4d 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -13,7 +13,8 @@ # limitations under the License. from synapse.rest.media.v1.preview_url_resource import ( - decode_and_calc_og, + _calc_og, + decode_body, get_html_media_encoding, summarize_paragraphs, ) @@ -158,7 +159,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -173,7 +175,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -191,7 +194,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual( og, @@ -212,7 +216,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) @@ -225,7 +230,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) @@ -239,7 +245,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) @@ -253,21 +260,22 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_empty(self): """Test a body with no data in it.""" html = b"" - og = decode_and_calc_og(html, "http://example.com/test.html") - self.assertEqual(og, {}) + tree = decode_body(html) + self.assertIsNone(tree) def test_no_tree(self): """A valid body with no tree in it.""" html = b"\x00" - og = decode_and_calc_og(html, "http://example.com/test.html") - self.assertEqual(og, {}) + tree = decode_body(html) + self.assertIsNone(tree) def test_invalid_encoding(self): """An invalid character encoding should be ignored and treated as UTF-8, if possible.""" @@ -279,9 +287,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og( - html, "http://example.com/test.html", "invalid-encoding" - ) + tree = decode_body(html, "invalid-encoding") + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_invalid_encoding2(self): @@ -295,7 +302,8 @@ class CalcOgTestCase(unittest.TestCase): """ - og = decode_and_calc_og(html, "http://example.com/test.html") + tree = decode_body(html) + og = _calc_og(tree, "http://example.com/test.html") self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})