Autodiscover oEmbed endpoint from returned HTML (#10822)
Searches the returned HTML for an oEmbed endpoint using the autodiscovery mechanism (`<link rel=...>`) and requests it to generate the preview.
parent 593eeac19e
commit 1b112840d2
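For orientation, the sketch below illustrates the general autodiscovery flow this commit wires into URL previews: parse the returned HTML, look for a `<link rel="alternate" type="application/json+oembed">` element (some providers use `rel="alternative"`), and fetch the referenced JSON document. This is a minimal, hypothetical sketch rather than the Synapse implementation; the `requests` calls and the `autodiscover_oembed_url`/`fetch_preview` helpers are assumptions for illustration only.

```python
# Minimal, hypothetical sketch of oEmbed autodiscovery (not the Synapse code itself).
from typing import Optional

import requests  # assumed available for this illustration
from lxml import etree


def autodiscover_oembed_url(html: bytes) -> Optional[str]:
    """Return the oEmbed endpoint advertised by an HTML document, if any."""
    if not html:
        return None
    parser = etree.HTMLParser(recover=True, encoding="utf-8")
    tree = etree.fromstring(html, parser)
    if tree is None:
        return None
    # Providers advertise the endpoint with
    # <link rel="alternate" type="application/json+oembed" href="...">;
    # some (e.g. Flickr) use rel="alternative" instead.
    for rel in ("alternate", "alternative"):
        for tag in tree.xpath(
            "//link[@rel='%s'][@type='application/json+oembed']" % rel
        ):
            if "href" in tag.attrib:
                return tag.attrib["href"]
    return None


def fetch_preview(page_url: str) -> dict:
    """Fetch a page and, if it advertises an oEmbed endpoint, prefer that for the preview."""
    html = requests.get(page_url, timeout=10).content
    oembed_url = autodiscover_oembed_url(html)
    if oembed_url is None:
        return {}  # fall back to scraping Open Graph tags from the HTML itself
    return requests.get(oembed_url, timeout=10).json()
```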
changelog.d/10822.feature (new file)

@@ -0,0 +1 @@
+Support autodiscovery of oEmbed previews.
@@ -96,6 +96,32 @@ class OEmbedProvider:
         # No match.
         return None
 
+    def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
+        """
+        Search an HTML document for oEmbed autodiscovery information.
+
+        Args:
+            tree: The parsed HTML body.
+
+        Returns:
+            The URL to use for oEmbed information, or None if no URL was found.
+        """
+        # Search for link elements with the proper rel and type attributes.
+        for tag in tree.xpath(
+            "//link[@rel='alternate'][@type='application/json+oembed']"
+        ):
+            if "href" in tag.attrib:
+                return tag.attrib["href"]
+
+        # Some providers (e.g. Flickr) use alternative instead of alternate.
+        for tag in tree.xpath(
+            "//link[@rel='alternative'][@type='application/json+oembed']"
+        ):
+            if "href" in tag.attrib:
+                return tag.attrib["href"]
+
+        return None
+
     def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
         """
         Parse the oEmbed response into an Open Graph response.
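As a quick standalone illustration of the XPath query used by the new `autodiscover_from_html` method above (a sketch only, assuming `lxml` is installed; the example HTML and URLs are made up and not part of the commit):

```python
# Sketch only: exercises the same kind of XPath query outside Synapse.
from lxml import etree

html = b"""
<html>
  <head>
    <link rel="alternate" type="application/json+oembed"
          href="https://publish.example.com/oembed?url=https%3A%2F%2Fexample.com%2Fpost&format=json"
          title="example" />
  </head>
  <body>A post.</body>
</html>
"""

parser = etree.HTMLParser(recover=True, encoding="utf-8")
tree = etree.fromstring(html, parser)

hrefs = [
    tag.attrib["href"]
    for tag in tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']")
    if "href" in tag.attrib
]
print(hrefs)  # one element: the provider's oEmbed endpoint URL
```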
@@ -22,7 +22,7 @@ import re
 import shutil
 import sys
 import traceback
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union
 from urllib import parse as urlparse
 
 import attr
@@ -296,22 +296,32 @@ class PreviewUrlResource(DirectServeJsonResource):
                 body = file.read()
 
             encoding = get_html_media_encoding(body, media_info.media_type)
-            og = decode_and_calc_og(body, media_info.uri, encoding)
-
-            await self._precache_image_url(user, media_info, og)
-
-        elif oembed_url and _is_json(media_info.media_type):
-            # Handle an oEmbed response.
-            with open(media_info.filename, "rb") as file:
-                body = file.read()
-
-            oembed_response = self._oembed.parse_oembed_response(url, body)
-            og = oembed_response.open_graph_result
-
-            # Use the cache age from the oEmbed result, instead of the HTTP response.
-            if oembed_response.cache_age is not None:
-                expiration_ms = oembed_response.cache_age
-
+            tree = decode_body(body, encoding)
+            if tree is not None:
+                # Check if this HTML document points to oEmbed information and
+                # defer to that.
+                oembed_url = self._oembed.autodiscover_from_html(tree)
+                og = {}
+                if oembed_url:
+                    oembed_info = await self._download_url(oembed_url, user)
+                    og, expiration_ms = await self._handle_oembed_response(
+                        url, oembed_info, expiration_ms
+                    )
+
+                # If there was no oEmbed URL (or oEmbed parsing failed), attempt
+                # to generate the Open Graph information from the HTML.
+                if not oembed_url or not og:
+                    og = _calc_og(tree, media_info.uri)
+
+                await self._precache_image_url(user, media_info, og)
+            else:
+                og = {}
+
+        elif oembed_url:
+            # Handle the oEmbed information.
+            og, expiration_ms = await self._handle_oembed_response(
+                url, media_info, expiration_ms
+            )
             await self._precache_image_url(user, media_info, og)
 
         else:
@@ -479,6 +489,39 @@ class PreviewUrlResource(DirectServeJsonResource):
         else:
             del og["og:image"]
 
+    async def _handle_oembed_response(
+        self, url: str, media_info: MediaInfo, expiration_ms: int
+    ) -> Tuple[JsonDict, int]:
+        """
+        Parse the downloaded oEmbed info.
+
+        Args:
+            url: The URL which is being previewed (not the one which was
+                requested).
+            media_info: The media being previewed.
+            expiration_ms: The length of time, in milliseconds, the media is valid for.
+
+        Returns:
+            A tuple of:
+                The Open Graph dictionary, if the oEmbed info can be parsed.
+                The (possibly updated) length of time, in milliseconds, the media is valid for.
+        """
+        # If JSON was not returned, there's nothing to do.
+        if not _is_json(media_info.media_type):
+            return {}, expiration_ms
+
+        with open(media_info.filename, "rb") as file:
+            body = file.read()
+
+        oembed_response = self._oembed.parse_oembed_response(url, body)
+        open_graph_result = oembed_response.open_graph_result
+
+        # Use the cache age from the oEmbed result, if one was given.
+        if open_graph_result and oembed_response.cache_age is not None:
+            expiration_ms = oembed_response.cache_age
+
+        return open_graph_result, expiration_ms
+
     def _start_expire_url_cache_data(self) -> Deferred:
         return run_as_background_process(
             "expire_url_cache_data", self._expire_url_cache_data
@@ -631,26 +674,22 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
     return "utf-8"
 
 
-def decode_and_calc_og(
-    body: bytes, media_uri: str, request_encoding: Optional[str] = None
-) -> JsonDict:
+def decode_body(
+    body: bytes, request_encoding: Optional[str] = None
+) -> Optional["etree.Element"]:
     """
-    Calculate metadata for an HTML document.
-
-    This uses lxml to parse the HTML document into the OG response. If errors
-    occur during processing of the document, an empty response is returned.
+    This uses lxml to parse the HTML document.
 
     Args:
         body: The HTML document, as bytes.
-        media_url: The URI used to download the body.
         request_encoding: The character encoding of the body, as a string.
 
     Returns:
-        The OG response as a dictionary.
+        The parsed HTML body, or None if an error occurred during processed.
     """
     # If there's no body, nothing useful is going to be found.
     if not body:
-        return {}
+        return None
 
     from lxml import etree
 
@@ -662,25 +701,22 @@ def decode_and_calc_og(
         parser = etree.HTMLParser(recover=True, encoding="utf-8")
     except Exception as e:
         logger.warning("Unable to create HTML parser: %s" % (e,))
-        return {}
+        return None
 
-    def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
-        # Attempt to parse the body. If this fails, log and return no metadata.
-        tree = etree.fromstring(body_attempt, parser)
-
-        # The data was successfully parsed, but no tree was found.
-        if tree is None:
-            return {}
-
-        return _calc_og(tree, media_uri)
+    def _attempt_decode_body(
+        body_attempt: Union[bytes, str]
+    ) -> Optional["etree.Element"]:
+        # Attempt to parse the body. Returns None if the body was successfully
+        # parsed, but no tree was found.
+        return etree.fromstring(body_attempt, parser)
 
     # Attempt to parse the body. If this fails, log and return no metadata.
     try:
-        return _attempt_calc_og(body)
+        return _attempt_decode_body(body)
     except UnicodeDecodeError:
         # blindly try decoding the body as utf-8, which seems to fix
         # the charset mismatches on https://google.com
-        return _attempt_calc_og(body.decode("utf-8", "ignore"))
+        return _attempt_decode_body(body.decode("utf-8", "ignore"))
 
 
 def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
@@ -725,9 +725,107 @@ class URLPreviewTests(unittest.HomeserverTestCase):
             },
         )
 
+    def test_oembed_autodiscovery(self):
+        """
+        Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.
+        1. Request a preview of a URL which is not known to the oEmbed code.
+        2. It returns HTML including a link to an oEmbed preview.
+        3. The oEmbed preview is requested and returns a URL for an image.
+        4. The image is requested for thumbnailing.
+        """
+        # This is a little cheesy in that we use the www subdomain (which isn't the
+        # list of oEmbed patterns) to get "raw" HTML response.
+        self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+        result = b"""
+        <link rel="alternate" type="application/json+oembed"
+            href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
+            title="matrixdotorg" />
+        """
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+
+        self.pump()
+
+        # The oEmbed response.
+        result2 = {
+            "version": "1.0",
+            "type": "photo",
+            "url": "http://cdn.twitter.com/matrixdotorg",
+        }
+        oembed_content = json.dumps(result2).encode("utf-8")
+
+        # Ensure a second request is made to the oEmbed URL.
+        client = self.reactor.tcpClients[1][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+            )
+            % (len(oembed_content),)
+            + oembed_content
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/oembed?", server.data)
+
+        # Ensure a third request is made to the photo URL.
+        client = self.reactor.tcpClients[2][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b"Content-Type: image/png\r\n\r\n"
+            )
+            % (len(SMALL_PNG),)
+            + SMALL_PNG
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/matrixdotorg", server.data)
+
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        self.assertEqual(
+            body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
+        )
+        self.assertTrue(body["og:image"].startswith("mxc://"))
+        self.assertEqual(body["og:image:height"], 1)
+        self.assertEqual(body["og:image:width"], 1)
+        self.assertEqual(body["og:image:type"], "image/png")
+
     def _download_image(self):
         """Downloads an image into the URL cache.
 
         Returns:
             A (host, media_id) tuple representing the MXC URI of the image.
         """
@@ -13,7 +13,8 @@
 # limitations under the License.
 
 from synapse.rest.media.v1.preview_url_resource import (
-    decode_and_calc_og,
+    _calc_og,
+    decode_body,
     get_html_media_encoding,
     summarize_paragraphs,
 )
@@ -158,7 +159,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -173,7 +175,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -191,7 +194,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(
             og,
@@ -212,7 +216,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -225,7 +230,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
@@ -239,7 +245,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
 
@@ -253,21 +260,22 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
     def test_empty(self):
         """Test a body with no data in it."""
         html = b""
-        og = decode_and_calc_og(html, "http://example.com/test.html")
-        self.assertEqual(og, {})
+        tree = decode_body(html)
+        self.assertIsNone(tree)
 
     def test_no_tree(self):
         """A valid body with no tree in it."""
         html = b"\x00"
-        og = decode_and_calc_og(html, "http://example.com/test.html")
-        self.assertEqual(og, {})
+        tree = decode_body(html)
+        self.assertIsNone(tree)
 
     def test_invalid_encoding(self):
         """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
@@ -279,9 +287,8 @@ class CalcOgTestCase(unittest.TestCase):
         </body>
         </html>
         """
-        og = decode_and_calc_og(
-            html, "http://example.com/test.html", "invalid-encoding"
-        )
+        tree = decode_body(html, "invalid-encoding")
+        og = _calc_og(tree, "http://example.com/test.html")
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_invalid_encoding2(self):
@@ -295,7 +302,8 @@ class CalcOgTestCase(unittest.TestCase):
         </body>
         </html>
         """
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
         self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})