From 1b112840d2c6dafa131eba4f0285409bb7345661 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <clokep@users.noreply.github.com>
Date: Fri, 8 Oct 2021 14:14:42 -0400
Subject: [PATCH] Autodiscover oEmbed endpoint from returned HTML (#10822)

Searches the returned HTML for an oEmbed endpoint using the
autodiscovery mechanism (`<link rel=...>`), and will request it
to generate the preview.
---
 changelog.d/10822.feature                     |   1 +
 synapse/rest/media/v1/oembed.py               |  26 +++++
 synapse/rest/media/v1/preview_url_resource.py | 108 ++++++++++++------
 tests/rest/media/v1/test_url_preview.py       | 100 +++++++++++++++-
 tests/test_preview.py                         |  40 ++++---
 5 files changed, 222 insertions(+), 53 deletions(-)
 create mode 100644 changelog.d/10822.feature
diff --git a/changelog.d/10822.feature b/changelog.d/10822.feature
new file mode 100644
index 000000000..72566e31e
--- /dev/null
+++ b/changelog.d/10822.feature
@@ -0,0 +1 @@
+Support autodiscovery of oEmbed previews.
diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py
index e04671fb9..6d7e1f906 100644
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@@ -96,6 +96,32 @@ class OEmbedProvider:
         # No match.
         return None
 
+    def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
+        """
+        Search an HTML document for oEmbed autodiscovery information.
+
+        Args:
+            tree: The parsed HTML body.
+
+        Returns:
+            The URL to use for oEmbed information, or None if no URL was found.
+        """
+        # Search for link elements with the proper rel and type attributes.
+        for tag in tree.xpath(
+            "//link[@rel='alternate'][@type='application/json+oembed']"
+        ):
+            if "href" in tag.attrib:
+                return tag.attrib["href"]
+
+        # Some providers (e.g. Flickr) use alternative instead of alternate.
+        for tag in tree.xpath(
+            "//link[@rel='alternative'][@type='application/json+oembed']"
+        ):
+            if "href" in tag.attrib:
+                return tag.attrib["href"]
+
+        return None
+
     def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
         """
         Parse the oEmbed response into an Open Graph response.
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 044f44a39..1fe0fc8aa 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -22,7 +22,7 @@ import re
 import shutil
 import sys
 import traceback
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union
 from urllib import parse as urlparse
 
 import attr
@@ -296,22 +296,32 @@ class PreviewUrlResource(DirectServeJsonResource):
                 body = file.read()
 
             encoding = get_html_media_encoding(body, media_info.media_type)
-            og = decode_and_calc_og(body, media_info.uri, encoding)
+            tree = decode_body(body, encoding)
+            if tree is not None:
+                # Check if this HTML document points to oEmbed information and
+                # defer to that.
+                oembed_url = self._oembed.autodiscover_from_html(tree)
+                og = {}
+                if oembed_url:
+                    oembed_info = await self._download_url(oembed_url, user)
+                    og, expiration_ms = await self._handle_oembed_response(
+                        url, oembed_info, expiration_ms
+                    )
 
-            await self._precache_image_url(user, media_info, og)
+                # If there was no oEmbed URL (or oEmbed parsing failed), attempt
+                # to generate the Open Graph information from the HTML.
+                if not oembed_url or not og:
+                    og = _calc_og(tree, media_info.uri)
 
-        elif oembed_url and _is_json(media_info.media_type):
-            # Handle an oEmbed response.
-            with open(media_info.filename, "rb") as file:
-                body = file.read()
-
-            oembed_response = self._oembed.parse_oembed_response(url, body)
-            og = oembed_response.open_graph_result
-
-            # Use the cache age from the oEmbed result, instead of the HTTP response.
-            if oembed_response.cache_age is not None:
-                expiration_ms = oembed_response.cache_age
+                await self._precache_image_url(user, media_info, og)
+            else:
+                og = {}
 
+        elif oembed_url:
+            # Handle the oEmbed information.
+            og, expiration_ms = await self._handle_oembed_response(
+                url, media_info, expiration_ms
+            )
             await self._precache_image_url(user, media_info, og)
 
         else:
@@ -479,6 +489,39 @@ class PreviewUrlResource(DirectServeJsonResource):
         else:
             del og["og:image"]
 
+    async def _handle_oembed_response(
+        self, url: str, media_info: MediaInfo, expiration_ms: int
+    ) -> Tuple[JsonDict, int]:
+        """
+        Parse the downloaded oEmbed info.
+
+        Args:
+            url: The URL which is being previewed (not the one which was
+                requested).
+            media_info: The media being previewed.
+            expiration_ms: The length of time, in milliseconds, the media is valid for.
+
+        Returns:
+            A tuple of:
+                The Open Graph dictionary, if the oEmbed info can be parsed.
+                The (possibly updated) length of time, in milliseconds, the media is valid for.
+        """
+        # If JSON was not returned, there's nothing to do.
+        if not _is_json(media_info.media_type):
+            return {}, expiration_ms
+
+        with open(media_info.filename, "rb") as file:
+            body = file.read()
+
+        oembed_response = self._oembed.parse_oembed_response(url, body)
+        open_graph_result = oembed_response.open_graph_result
+
+        # Use the cache age from the oEmbed result, if one was given.
+        if open_graph_result and oembed_response.cache_age is not None:
+            expiration_ms = oembed_response.cache_age
+
+        return open_graph_result, expiration_ms
+
     def _start_expire_url_cache_data(self) -> Deferred:
         return run_as_background_process(
             "expire_url_cache_data", self._expire_url_cache_data
@@ -631,26 +674,22 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
     return "utf-8"
 
 
-def decode_and_calc_og(
-    body: bytes, media_uri: str, request_encoding: Optional[str] = None
-) -> JsonDict:
+def decode_body(
+    body: bytes, request_encoding: Optional[str] = None
+) -> Optional["etree.Element"]:
     """
-    Calculate metadata for an HTML document.
-
-    This uses lxml to parse the HTML document into the OG response. If errors
-    occur during processing of the document, an empty response is returned.
+    This uses lxml to parse the HTML document.
 
     Args:
         body: The HTML document, as bytes.
-        media_url: The URI used to download the body.
         request_encoding: The character encoding of the body, as a string.
 
     Returns:
-        The OG response as a dictionary.
+        The parsed HTML body, or None if an error occurred during processed.
     """
     # If there's no body, nothing useful is going to be found.
     if not body:
-        return {}
+        return None
 
     from lxml import etree
 
@@ -662,25 +701,22 @@ def decode_and_calc_og(
         parser = etree.HTMLParser(recover=True, encoding="utf-8")
     except Exception as e:
         logger.warning("Unable to create HTML parser: %s" % (e,))
-        return {}
+        return None
 
-    def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
-        # Attempt to parse the body. If this fails, log and return no metadata.
-        tree = etree.fromstring(body_attempt, parser)
-
-        # The data was successfully parsed, but no tree was found.
-        if tree is None:
-            return {}
-
-        return _calc_og(tree, media_uri)
+    def _attempt_decode_body(
+        body_attempt: Union[bytes, str]
+    ) -> Optional["etree.Element"]:
+        # Attempt to parse the body. Returns None if the body was successfully
+        # parsed, but no tree was found.
+        return etree.fromstring(body_attempt, parser)
 
     # Attempt to parse the body. If this fails, log and return no metadata.
     try:
-        return _attempt_calc_og(body)
+        return _attempt_decode_body(body)
     except UnicodeDecodeError:
         # blindly try decoding the body as utf-8, which seems to fix
         # the charset mismatches on https://google.com
-        return _attempt_calc_og(body.decode("utf-8", "ignore"))
+        return _attempt_decode_body(body.decode("utf-8", "ignore"))
 
 
 def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py
index ce43de780..8698135a7 100644
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@@ -725,9 +725,107 @@ class URLPreviewTests(unittest.HomeserverTestCase):
             },
         )
 
+    def test_oembed_autodiscovery(self):
+        """
+        Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.
+        1. Request a preview of a URL which is not known to the oEmbed code.
+        2. It returns HTML including a link to an oEmbed preview.
+        3. The oEmbed preview is requested and returns a URL for an image.
+        4. The image is requested for thumbnailing.
+        """
+        # This is a little cheesy in that we use the www subdomain (which isn't the
+        # list of oEmbed patterns) to get "raw" HTML response.
+        self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+        result = b"""
+        <link rel="alternate" type="application/json+oembed"
+            href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
+            title="matrixdotorg" />
+        """
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+
+        self.pump()
+
+        # The oEmbed response.
+        result2 = {
+            "version": "1.0",
+            "type": "photo",
+            "url": "http://cdn.twitter.com/matrixdotorg",
+        }
+        oembed_content = json.dumps(result2).encode("utf-8")
+
+        # Ensure a second request is made to the oEmbed URL.
+        client = self.reactor.tcpClients[1][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+            )
+            % (len(oembed_content),)
+            + oembed_content
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/oembed?", server.data)
+
+        # Ensure a third request is made to the photo URL.
+        client = self.reactor.tcpClients[2][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b"Content-Type: image/png\r\n\r\n"
+            )
+            % (len(SMALL_PNG),)
+            + SMALL_PNG
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/matrixdotorg", server.data)
+
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        self.assertEqual(
+            body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
+        )
+        self.assertTrue(body["og:image"].startswith("mxc://"))
+        self.assertEqual(body["og:image:height"], 1)
+        self.assertEqual(body["og:image:width"], 1)
+        self.assertEqual(body["og:image:type"], "image/png")
+
     def _download_image(self):
         """Downloads an image into the URL cache.
-
         Returns:
             A (host, media_id) tuple representing the MXC URI of the image.
         """
diff --git a/tests/test_preview.py b/tests/test_preview.py
index 48e792b55..09e017b4d 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -13,7 +13,8 @@
 # limitations under the License.
 
 from synapse.rest.media.v1.preview_url_resource import (
-    decode_and_calc_og,
+    _calc_og,
+    decode_body,
     get_html_media_encoding,
     summarize_paragraphs,
 )
@@ -158,7 +159,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -173,7 +175,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -191,7 +194,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(
             og,
@@ -212,7 +216,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -225,7 +230,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
@@ -239,7 +245,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
 
@@ -253,21 +260,22 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
     def test_empty(self):
         """Test a body with no data in it."""
         html = b""
-        og = decode_and_calc_og(html, "http://example.com/test.html")
-        self.assertEqual(og, {})
+        tree = decode_body(html)
+        self.assertIsNone(tree)
 
     def test_no_tree(self):
         """A valid body with no tree in it."""
         html = b"\x00"
-        og = decode_and_calc_og(html, "http://example.com/test.html")
-        self.assertEqual(og, {})
+        tree = decode_body(html)
+        self.assertIsNone(tree)
 
     def test_invalid_encoding(self):
         """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
@@ -279,9 +287,8 @@ class CalcOgTestCase(unittest.TestCase):
         </body>
         </html>
         """
-        og = decode_and_calc_og(
-            html, "http://example.com/test.html", "invalid-encoding"
-        )
+        tree = decode_body(html, "invalid-encoding")
+        og = _calc_og(tree, "http://example.com/test.html")
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_invalid_encoding2(self):
@@ -295,7 +302,8 @@ class CalcOgTestCase(unittest.TestCase):
         </body>
         </html>
         """
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
         self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})